The goal is to create models that can forecast the sales for small basket across different locations and product categories.
Small Basket is a huge online / mobile application based grocery retailer in India, founded in 2011. Small Basket is trying to manage its supply chain and delivery partners and would like to accurately forecast the sales for the period starting from 1st January 2019 to 15th January 2019.
Importing Required Libraries
# ---- Standard library ----
import os
import gc  # needed: gc.collect() is called throughout the notebook
import shutil
import pickle as pkl
# To filter/truncate warnings
import warnings

# ---- Scientific stack ----
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ---- Typing helpers ----
from typing import List, Optional

# ---- scikit-learn ----
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, export_graphviz,DecisionTreeRegressor
from sklearn.metrics import accuracy_score, recall_score, precision_score,confusion_matrix,mean_absolute_error,mean_squared_error,classification_report
from sklearn.metrics import confusion_matrix, accuracy_score

# ---- Keras / deep learning ----
from keras.utils import to_categorical
from keras.models import Sequential, Model
from keras.layers import Input, Dense, LSTM, Embedding, Flatten, concatenate, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import regularizers

# ---- Notebook display helpers ----
from IPython.display import SVG
from IPython.display import display
from IPython.display import Image
#from graphviz import Source

warnings.filterwarnings('ignore')
EDA
Pre-processing for EDA
Functions to Reduce Memory Consumption
def reduce_mem_usage(df):
    """Shrink *df*'s memory footprint by downcasting every column in place.

    Integer columns are narrowed to the smallest int type that holds their
    range, float columns to float16/float32 where possible, and object
    columns become ``category``.  Datetime (and any other non-numeric)
    columns are left untouched.  Prints the memory usage before and after
    and returns the mutated dataframe.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        # Skip datetimes and any other non-numeric dtype *before* touching
        # min()/max(): the original computed them first, which is a no-op
        # noise path for datetimes and raises TypeError for e.g. an
        # unordered categorical column.
        if not pd.api.types.is_numeric_dtype(col_type):
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        else:
            # Floats (and bools fall through here, as in the original).
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
import pandas as pd
from typing import List
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast every float64 column of *df* to the narrowest float dtype."""
    float_cols = list(df.select_dtypes(include=['float64']).columns)
    df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float')
    return df
def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast every int64 column of *df* to the narrowest integer dtype."""
    int_cols = list(df.select_dtypes(include=['int64']).columns)
    df[int_cols] = df[int_cols].apply(pd.to_numeric, downcast='integer')
    return df
def optimize_objects(df: pd.DataFrame, datetime_features: List[str]) -> pd.DataFrame:
for col in df.select_dtypes(include=['object']):
if col not in datetime_features:
num_unique_values = len(df[col].unique())
num_total_values = len(df[col])
if float(num_unique_values) / num_total_values < 0.5:
df[col] = df[col].astype('category')
else:
df[col] = pd.to_datetime(df[col])
return df
def optimize(df, datetime_features=None):
    """Run all three dtype optimisers (objects, then ints, then floats) on *df*.

    *datetime_features* lists object columns to parse as datetimes.
    NOTE(review): the original used a mutable default argument (``[]``);
    ``None`` is the safe equivalent and is backward-compatible for callers.
    """
    if datetime_features is None:
        datetime_features = []
    return optimize_floats(optimize_ints(optimize_objects(df, datetime_features)))
Functions to load files and call the memory optimization function for them
def import_data(file):
    """Read *file* as CSV (parsing its 'date' column) and shrink its memory.

    NOTE(review): the original passed ``date_parser=lambda x: pd.to_datetime(x)``,
    which is redundant — ``parse_dates`` already performs that conversion —
    and the ``date_parser`` keyword is deprecated (removed in pandas 2.x).
    """
    df = pd.read_csv(file, parse_dates=['date'])
    df = reduce_mem_usage(df)
    return df
def import_other_data(file):
    """Read *file* as a plain CSV and return it with optimised memory usage."""
    frame = pd.read_csv(file)
    return reduce_mem_usage(frame)
Loading the train data
train = import_data('train.csv')
Understanding train data
def understand_data(data):
    """Summarise each column of *data*: dtype, NA count, distinct-value
    count, and the distinct values themselves (as a string)."""
    summary = {
        "Datatype": data.dtypes,
        "No of NAs": data.isna().sum(),
        "No of Levels": data.apply(lambda col: col.nunique(), axis=0),
        "Levels": data.apply(lambda col: str(col.unique()), axis=0),
    }
    return pd.DataFrame(summary)
understand_data(train)
Generating new features out of the existing ones
import datetime
def process_data(data):
    """Derive calendar features from the 'date' column, in place.

    Adds: 'day' (weekday name), 'month' (month name), 'year' (int year)
    and 'month_years' (monthly Period) columns to *data*.
    """
    dates = data['date'].dt
    data['day'] = dates.day_name()
    data['month'] = dates.strftime('%B')
    data['year'] = dates.year
    data['month_years'] = dates.to_period('M')
process_data(train)
print(train.memory_usage().sum() / 1024**2)
train=optimize_floats(train)
train=optimize_ints(train)
train=optimize_objects(train,[])
train=optimize(train)
print(train.memory_usage().sum() / 1024**2)
understand_data(train)
Handling Null Values
def total_NAs(data):
    """Print the grand total of missing values across all of *data*."""
    na_count = data.isna().sum().sum()
    print("Total NAs:", na_count)
Check total NAs in train data
total_NAs(train)
train.onpromotion.isna().sum()
Notice that we have NAs in train data. Train data has NAs for the attribute "onpromotion".
train.groupby(['year'])['onpromotion'].value_counts()
(train.groupby(['year']).agg({'onpromotion':'count'})).plot(kind='bar')
Notice that promotions are applicable only since 2016. Before 2016, they are not applicable.
So, we can impute missing values for promotion (from the year 2015) with False i.e., no promotion.
train.onpromotion.fillna(value=False,inplace=True)
Verify that there are no more NAs in train data
total_NAs(train)
train.year.unique()
EDA1: To check if sales are increased when items are sold on discounts/promotions
(train.groupby(['onpromotion','year']).agg({'unit_sales':'mean'})).plot(kind='bar')
Sales have increased when promotions are offered.
EDA2: To get aggregate sales data for each year across all locations
train_sub1=train.groupby('year').agg({'unit_sales':'sum'})
train_sub1
sns.barplot(train_sub1.index,train_sub1.unit_sales)
sns.pointplot(train_sub1.index,train_sub1.unit_sales)
train_sub1['Perc_inc']=100*(train_sub1['unit_sales'].pct_change())
train_sub1
sns.pointplot(train_sub1.iloc[1:,:].index,train_sub1.iloc[1:,:].Perc_inc)
Time series plots
from statsmodels.tsa.seasonal import seasonal_decompose
(train_sub1).plot(figsize=(10,6))
plt.show()
The above plot with year as index show trend in the data.
del train_sub1
gc.collect()
EDA3: To get aggregate sales data for each month for each of the 4 years
train_sub2_2015=train[train.year==2015].groupby('month').agg({'unit_sales':'sum'})
train_sub2_2016=train[train.year==2016].groupby('month').agg({'unit_sales':'sum'})
train_sub2_2017=train[train.year==2017].groupby('month').agg({'unit_sales':'sum'})
train_sub2_2018=train[train.year==2018].groupby('month').agg({'unit_sales':'sum'})
print(train_sub2_2015.shape)
print(train_sub2_2016.shape)
print(train_sub2_2017.shape)
print(train_sub2_2018.shape)
Top sales across all the months in each of the past 4 years
print(train_sub2_2015.sort_values(by='unit_sales',ascending=False).head(1))
print(train_sub2_2016.sort_values(by='unit_sales',ascending=False).head(1))
print(train_sub2_2017.sort_values(by='unit_sales',ascending=False).head(1))
print(train_sub2_2018.sort_values(by='unit_sales',ascending=False).head(1))
Least sales across all the months in each of the past 4 years
print(train_sub2_2015.sort_values(by='unit_sales',ascending=True).head(1))
print(train_sub2_2016.sort_values(by='unit_sales',ascending=True).head(1))
print(train_sub2_2017.sort_values(by='unit_sales',ascending=True).head(1))
print(train_sub2_2018.sort_values(by='unit_sales',ascending=True).head(1))
fig,ax = plt.subplots(2,2,figsize=(10,8))
fig.subplots_adjust(wspace=0.3,hspace=0.7)
sns.barplot(train_sub2_2015.index,train_sub2_2015.unit_sales,ax=ax[0][0])
ax[0][0].set_title('2015')
ax[0][0].set_xlabel('month')
ax[0][0].tick_params(axis ='x', rotation = 90)
ax[0][0].set_ylabel('unit_sales')
sns.barplot(train_sub2_2016.index,train_sub2_2016.unit_sales,ax=ax[0][1])
ax[0][1].set_title('2016')
ax[0][1].set_xlabel('month')
ax[0][1].tick_params(axis ='x', rotation = 90)
ax[0][1].set_ylabel('unit_sales')
sns.barplot(train_sub2_2017.index,train_sub2_2017.unit_sales,ax=ax[1][0])
ax[1][0].set_title('2017')
ax[1][0].set_xlabel('month')
ax[1][0].tick_params(axis ='x', rotation = 90)
ax[1][0].set_ylabel('unit_sales')
sns.barplot(train_sub2_2018.index,train_sub2_2018.unit_sales,ax=ax[1][1])
ax[1][1].set_title('2018')
ax[1][1].set_xlabel('month')
ax[1][1].tick_params(axis ='x', rotation = 90)
ax[1][1].set_ylabel('unit_sales')
fig.show()
fig,ax = plt.subplots(2,2,figsize=(10,8))
fig.subplots_adjust(wspace=0.3,hspace=0.7)
sns.pointplot(train_sub2_2015.index,train_sub2_2015.unit_sales,ax=ax[0][0])
ax[0][0].set_title('2015')
ax[0][0].set_xlabel('month')
ax[0][0].tick_params(axis ='x', rotation = 90)
ax[0][0].set_ylabel('unit_sales')
sns.pointplot(train_sub2_2016.index,train_sub2_2016.unit_sales,ax=ax[0][1])
ax[0][1].set_title('2016')
ax[0][1].set_xlabel('month')
ax[0][1].tick_params(axis ='x', rotation = 90)
ax[0][1].set_ylabel('unit_sales')
sns.pointplot(train_sub2_2017.index,train_sub2_2017.unit_sales,ax=ax[1][0])
ax[1][0].set_title('2017')
ax[1][0].set_xlabel('month')
ax[1][0].tick_params(axis ='x', rotation = 90)
ax[1][0].set_ylabel('unit_sales')
sns.pointplot(train_sub2_2018.index,train_sub2_2018.unit_sales,ax=ax[1][1])
ax[1][1].set_title('2018')
ax[1][1].set_xlabel('month')
ax[1][1].tick_params(axis ='x', rotation = 90)
ax[1][1].set_ylabel('unit_sales')
fig.show()
The Monthly pattern looks more similar over alternate years.
# Month-over-month percentage change in unit sales, per year.
train_sub2_2015['Perc_inc'] = 100 * (train_sub2_2015['unit_sales'].pct_change())
train_sub2_2016['Perc_inc'] = 100 * (train_sub2_2016['unit_sales'].pct_change())
train_sub2_2017['Perc_inc'] = 100 * (train_sub2_2017['unit_sales'].pct_change())
train_sub2_2018['Perc_inc'] = 100 * (train_sub2_2018['unit_sales'].pct_change())
# NOTE(review): the original plotted train_sub2_2015's Perc_inc in all four
# panels and passed an invalid `sharey=True` kwarg to sns.pointplot; each
# panel now shows its own year.  The first row is skipped (iloc[1:]) because
# pct_change() yields NaN there.
fig, ax = plt.subplots(2, 2, figsize=(10, 8))
fig.subplots_adjust(wspace=0.3, hspace=0.7)
sns.pointplot(train_sub2_2015.iloc[1:, :].index, train_sub2_2015.iloc[1:, :].Perc_inc, ax=ax[0][0])
ax[0][0].set_title('2015')
ax[0][0].set_xlabel('month')
ax[0][0].tick_params(axis='x', rotation=90)
ax[0][0].set_ylabel('Perc_inc')
sns.pointplot(train_sub2_2016.iloc[1:, :].index, train_sub2_2016.iloc[1:, :].Perc_inc, ax=ax[0][1])
ax[0][1].set_title('2016')
ax[0][1].set_xlabel('month')
ax[0][1].tick_params(axis='x', rotation=90)
ax[0][1].set_ylabel('Perc_inc')
sns.pointplot(train_sub2_2017.iloc[1:, :].index, train_sub2_2017.iloc[1:, :].Perc_inc, ax=ax[1][0])
ax[1][0].set_title('2017')
ax[1][0].set_xlabel('month')
ax[1][0].tick_params(axis='x', rotation=90)
ax[1][0].set_ylabel('Perc_inc')
sns.pointplot(train_sub2_2018.iloc[1:, :].index, train_sub2_2018.iloc[1:, :].Perc_inc, ax=ax[1][1])
ax[1][1].set_title('2018')
ax[1][1].set_xlabel('month')
ax[1][1].tick_params(axis='x', rotation=90)
ax[1][1].set_ylabel('Perc_inc')
fig.show()
Timeseries Plots
from statsmodels.tsa.seasonal import seasonal_decompose
fig,ax = plt.subplots(2,2,figsize=(30,15))
train_sub2_2015.plot(figsize=(10,6),ax=ax[0,0])
train_sub2_2016.plot(figsize=(10,6),ax=ax[0,1])
train_sub2_2017.plot(figsize=(10,6),ax=ax[1,0])
train_sub2_2018.plot(figsize=(10,6),ax=ax[1,1])
plt.show()
Though monthly seasonality exists the cycle is repeated for alternate years.
del [[train_sub2_2015,train_sub2_2016,train_sub2_2017,train_sub2_2018]]
gc.collect()
EDA4: To get aggregate sales data across each day of the week in each of the 12 months over the past 4 years
train_sub_month1=(train[train.month=='January'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month2=(train[train.month=='February'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month3=(train[train.month=='March'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month4=(train[train.month=='April'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month5=(train[train.month=='May'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month6=(train[train.month=='June'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month7=(train[train.month=='July'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month8=(train[train.month=='August'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month9=(train[train.month=='September'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month10=(train[train.month=='October'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month11=(train[train.month=='November'].groupby('day').agg({'unit_sales':'sum'}))
train_sub_month12=(train[train.month=='December'].groupby('day').agg({'unit_sales':'sum'}))
fig,ax = plt.subplots(3,4,figsize=(15,10))
fig.subplots_adjust(wspace=0.3,hspace=1)
sns.barplot(train_sub_month1.index,train_sub_month1.unit_sales,ax=ax[0][0])
ax[0][0].set_title('January')
ax[0][0].set_xlabel('day')
ax[0][0].tick_params(axis ='x', rotation = 90)
ax[0][0].set_ylabel('unit_sales')
sns.barplot(train_sub_month2.index,train_sub_month2.unit_sales,ax=ax[0][1])
ax[0][1].set_title('February')
ax[0][1].set_xlabel('day')
ax[0][1].tick_params(axis ='x', rotation = 90)
ax[0][1].set_ylabel('unit_sales')
sns.barplot(train_sub_month3.index,train_sub_month3.unit_sales,ax=ax[0][2])
ax[0][2].set_title('March')
ax[0][2].set_xlabel('day')
ax[0][2].tick_params(axis ='x', rotation = 90)
ax[0][2].set_ylabel('unit_sales')
sns.barplot(train_sub_month4.index,train_sub_month4.unit_sales,ax=ax[0][3])
ax[0][3].set_title('April')
ax[0][3].set_xlabel('day')
ax[0][3].tick_params(axis ='x', rotation = 90)
ax[0][3].set_ylabel('unit_sales')
fig.show()
sns.barplot(train_sub_month5.index,train_sub_month5.unit_sales,ax=ax[1][0])
ax[1][0].set_title('May')
ax[1][0].set_xlabel('day')
ax[1][0].tick_params(axis ='x', rotation = 90)
ax[1][0].set_ylabel('unit_sales')
fig.show()
sns.barplot(train_sub_month6.index,train_sub_month6.unit_sales,ax=ax[1][1])
ax[1][1].set_title('June')
ax[1][1].set_xlabel('day')
ax[1][1].tick_params(axis ='x', rotation = 90)
ax[1][1].set_ylabel('unit_sales')
fig.show()
sns.barplot(train_sub_month7.index,train_sub_month7.unit_sales,ax=ax[1][2])
ax[1][2].set_title('July')
ax[1][2].set_xlabel('day')
ax[1][2].tick_params(axis ='x', rotation = 90)
ax[1][2].set_ylabel('unit_sales')
fig.show()
sns.barplot(train_sub_month8.index,train_sub_month8.unit_sales,ax=ax[1][3])
ax[1][3].set_title('August')
ax[1][3].set_xlabel('day')
ax[1][3].tick_params(axis ='x', rotation = 90)
ax[1][3].set_ylabel('unit_sales')
fig.show()
sns.barplot(train_sub_month9.index,train_sub_month9.unit_sales,ax=ax[2][0])
ax[2][0].set_title('September')
ax[2][0].set_xlabel('day')
ax[2][0].tick_params(axis ='x', rotation = 90)
ax[2][0].set_ylabel('unit_sales')
fig.show()
sns.barplot(train_sub_month10.index,train_sub_month10.unit_sales,ax=ax[2][1])
ax[2][1].set_title('October')
ax[2][1].set_xlabel('day')
ax[2][1].tick_params(axis ='x', rotation = 90)
ax[2][1].set_ylabel('unit_sales')
fig.show()
sns.barplot(train_sub_month11.index,train_sub_month11.unit_sales,ax=ax[2][2])
ax[2][2].set_title('November')
ax[2][2].set_xlabel('day')
ax[2][2].tick_params(axis ='x', rotation = 90)
ax[2][2].set_ylabel('unit_sales')
fig.show()
sns.barplot(train_sub_month12.index,train_sub_month12.unit_sales,ax=ax[2][3])
ax[2][3].set_title('December')
ax[2][3].set_xlabel('day')
ax[2][3].tick_params(axis ='x', rotation = 90)
ax[2][3].set_ylabel('unit_sales')
fig.show()
fig,ax = plt.subplots(3,4,figsize=(15,10))
fig.subplots_adjust(wspace=0.3,hspace=1)
sns.pointplot(train_sub_month1.index,train_sub_month1.unit_sales,ax=ax[0][0])
ax[0][0].set_title('January')
ax[0][0].set_xlabel('day')
ax[0][0].tick_params(axis ='x', rotation = 90)
ax[0][0].set_ylabel('unit_sales')
sns.pointplot(train_sub_month2.index,train_sub_month2.unit_sales,ax=ax[0][1])
ax[0][1].set_title('February')
ax[0][1].set_xlabel('day')
ax[0][1].tick_params(axis ='x', rotation = 90)
ax[0][1].set_ylabel('unit_sales')
sns.pointplot(train_sub_month3.index,train_sub_month3.unit_sales,ax=ax[0][2])
ax[0][2].set_title('March')
ax[0][2].set_xlabel('day')
ax[0][2].tick_params(axis ='x', rotation = 90)
ax[0][2].set_ylabel('unit_sales')
sns.pointplot(train_sub_month4.index,train_sub_month4.unit_sales,ax=ax[0][3])
ax[0][3].set_title('April')
ax[0][3].set_xlabel('day')
ax[0][3].tick_params(axis ='x', rotation = 90)
ax[0][3].set_ylabel('unit_sales')
fig.show()
sns.pointplot(train_sub_month5.index,train_sub_month5.unit_sales,ax=ax[1][0])
ax[1][0].set_title('May')
ax[1][0].set_xlabel('day')
ax[1][0].tick_params(axis ='x', rotation = 90)
ax[1][0].set_ylabel('unit_sales')
fig.show()
sns.pointplot(train_sub_month6.index,train_sub_month6.unit_sales,ax=ax[1][1])
ax[1][1].set_title('June')
ax[1][1].set_xlabel('day')
ax[1][1].tick_params(axis ='x', rotation = 90)
ax[1][1].set_ylabel('unit_sales')
fig.show()
sns.pointplot(train_sub_month7.index,train_sub_month7.unit_sales,ax=ax[1][2])
ax[1][2].set_title('July')
ax[1][2].set_xlabel('day')
ax[1][2].tick_params(axis ='x', rotation = 90)
ax[1][2].set_ylabel('unit_sales')
fig.show()
sns.pointplot(train_sub_month8.index,train_sub_month8.unit_sales,ax=ax[1][3])
ax[1][3].set_title('August')
ax[1][3].set_xlabel('day')
ax[1][3].tick_params(axis ='x', rotation = 90)
ax[1][3].set_ylabel('unit_sales')
fig.show()
sns.pointplot(train_sub_month9.index,train_sub_month9.unit_sales,ax=ax[2][0])
ax[2][0].set_title('September')
ax[2][0].set_xlabel('day')
ax[2][0].tick_params(axis ='x', rotation = 90)
ax[2][0].set_ylabel('unit_sales')
fig.show()
sns.pointplot(train_sub_month10.index,train_sub_month10.unit_sales,ax=ax[2][1])
ax[2][1].set_title('October')
ax[2][1].set_xlabel('day')
ax[2][1].tick_params(axis ='x', rotation = 90)
ax[2][1].set_ylabel('unit_sales')
fig.show()
sns.pointplot(train_sub_month11.index,train_sub_month11.unit_sales,ax=ax[2][2])
ax[2][2].set_title('November')
ax[2][2].set_xlabel('day')
ax[2][2].tick_params(axis ='x', rotation = 90)
ax[2][2].set_ylabel('unit_sales')
fig.show()
sns.pointplot(train_sub_month12.index,train_sub_month12.unit_sales,ax=ax[2][3])
ax[2][3].set_title('December')
ax[2][3].set_xlabel('day')
ax[2][3].tick_params(axis ='x', rotation = 90)
ax[2][3].set_ylabel('unit_sales')
fig.show()
Time Series Plots
from statsmodels.tsa.seasonal import seasonal_decompose
fig,ax = plt.subplots(3,4,figsize=(40,20))
fig.subplots_adjust(wspace=0.3,hspace=1)
train_sub_month1.plot(figsize=(15,6),ax=ax[0,0])
train_sub_month2.plot(figsize=(15,6),ax=ax[0,1])
train_sub_month3.plot(figsize=(15,6),ax=ax[0,2])
train_sub_month4.plot(figsize=(15,6),ax=ax[0,3])
train_sub_month5.plot(figsize=(15,6),ax=ax[1,0])
train_sub_month6.plot(figsize=(15,6),ax=ax[1,1])
train_sub_month7.plot(figsize=(15,6),ax=ax[1,2])
train_sub_month8.plot(figsize=(15,6),ax=ax[1,3])
train_sub_month9.plot(figsize=(15,6),ax=ax[2,0])
train_sub_month10.plot(figsize=(15,6),ax=ax[2,1])
train_sub_month11.plot(figsize=(15,6),ax=ax[2,2])
train_sub_month12.plot(figsize=(15,6),ax=ax[2,3])
plt.show()
del [[train_sub_month1, train_sub_month2, train_sub_month3, train_sub_month4, train_sub_month5, train_sub_month6,
train_sub_month7, train_sub_month8, train_sub_month9, train_sub_month10, train_sub_month11, train_sub_month12]]
gc.collect()
EDA5: To get aggregate sales data across different locations over all 4 years
train_sub3_2015=train[train.year==2015].groupby('locationId').agg({'unit_sales':'sum'})
train_sub3_2016=train[train.year==2016].groupby('locationId').agg({'unit_sales':'sum'})
train_sub3_2017=train[train.year==2017].groupby('locationId').agg({'unit_sales':'sum'})
train_sub3_2018=train[train.year==2018].groupby('locationId').agg({'unit_sales':'sum'})
print(train_sub3_2015.shape)
print(train_sub3_2016.shape)
print(train_sub3_2017.shape)
print(train_sub3_2018.shape)
Top sales across different locations over different years
print(train_sub3_2015.sort_values(by='unit_sales',ascending=False).head(5))
print(train_sub3_2016.sort_values(by='unit_sales',ascending=False).head(5))
print(train_sub3_2017.sort_values(by='unit_sales',ascending=False).head(5))
print(train_sub3_2018.sort_values(by='unit_sales',ascending=False).head(5))
Least sales across different locations over different years
print(train_sub3_2015.sort_values(by='unit_sales',ascending=True).head(5))
print(train_sub3_2016.sort_values(by='unit_sales',ascending=True).head(5))
print(train_sub3_2017.sort_values(by='unit_sales',ascending=True).head(5))
print(train_sub3_2018.sort_values(by='unit_sales',ascending=True).head(5))
Top sales across the years are usually seen in the locations - 44, 45, 47, 3, 46 and 49.
Least sales across years are usually seen in the locations - 20, 21, 22, 29, 42, 26, 30, 32 and 42.
# Bar charts of total sales per location, one panel per year.
# NOTE(review): the original labelled the x axis 'month' even though the
# grouping index here is locationId; labels corrected.
fig, ax = plt.subplots(2, 2, figsize=(20, 10))
fig.subplots_adjust(wspace=0.1, hspace=0.5)
sns.barplot(train_sub3_2015.index, train_sub3_2015.unit_sales, ax=ax[0][0])
ax[0][0].set_title('2015')
ax[0][0].set_xlabel('locationId')
ax[0][0].tick_params(axis='x', rotation=90)
ax[0][0].set_ylabel('unit_sales')
sns.barplot(train_sub3_2016.index, train_sub3_2016.unit_sales, ax=ax[0][1])
ax[0][1].set_title('2016')
ax[0][1].set_xlabel('locationId')
ax[0][1].tick_params(axis='x', rotation=90)
ax[0][1].set_ylabel('unit_sales')
sns.barplot(train_sub3_2017.index, train_sub3_2017.unit_sales, ax=ax[1][0])
ax[1][0].set_title('2017')
ax[1][0].set_xlabel('locationId')
ax[1][0].tick_params(axis='x', rotation=90)
ax[1][0].set_ylabel('unit_sales')
sns.barplot(train_sub3_2018.index, train_sub3_2018.unit_sales, ax=ax[1][1])
ax[1][1].set_title('2018')
ax[1][1].set_xlabel('locationId')
ax[1][1].tick_params(axis='x', rotation=90)
ax[1][1].set_ylabel('unit_sales')
fig.show()
The overall sales pattern is uniform across different locations over the past 4 years.
del [[train_sub3_2015, train_sub3_2016, train_sub3_2017, train_sub3_2018]]
gc.collect()
EDA6: To get monthly aggregate sales data across all the locations in the past 4 years
# Monthly aggregate sales across all locations for the whole 4-year span.
train_sub4 = train.groupby('month_years').agg({'unit_sales': 'sum'})
# NOTE(review): the original inspected `train_sub5.shape`, a variable that
# does not exist yet at this point (NameError); the new frame is train_sub4.
train_sub4.shape
Top unit sales across each month of all the 4 years i.e., across 12*4 = 48 months
train_sub4.sort_values(by='unit_sales',ascending=False).head(10)
Least unit sales across each month of all the 4 years i.e., across 12*4 = 48 months
train_sub4.sort_values(by='unit_sales',ascending=True).head(10)
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12,7))
sns.barplot(train_sub4.index,train_sub4.unit_sales)
plt.xlabel('month_year')
plt.xticks(rotation='vertical')
plt.ylabel('unit_sales',)
plt.title('unit sales for each month of all the 4 years')
plt.show()
Notice that there is an increase in sales over years, with some monthly fluctuations.
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12,7))
sns.pointplot(train_sub4.index,train_sub4.unit_sales)
plt.xlabel('month_year')
plt.xticks(rotation='vertical')
plt.ylabel('unit_sales',)
plt.title('unit sales for each month of all the 4 years')
plt.show()
Clearly there is a yearly trend with monthly seasonality in the data.
Time series plots
from statsmodels.tsa.seasonal import seasonal_decompose
(train_sub4).plot(figsize=(10,6))
plt.show()
This is a strong indication of trend over 4 years and seasonality across months.
from pylab import rcParams
import statsmodels.api as sm
rcParams['figure.figsize'] = 10, 6
decomposition = sm.tsa.seasonal_decompose(train_sub4, model = 'additive', freq= 12)
fig = decomposition.plot()
plt.show()
from pandas import Series
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
plt.figure()
plt.subplot(211)
plot_acf(train_sub4, ax=plt.gca())
plt.subplot(212)
plot_pacf(train_sub4, ax=plt.gca())
plt.show()
del [train_sub4]
gc.collect()
Loading item details data
os.chdir(r"/content/drive/My Drive/PHD")
item_details=import_other_data('item_details.csv')
Loading location details data
location_details=import_other_data('location_details.csv')
Understanding item_details data
understand_data(item_details)
Understanding location_details data
understand_data(location_details)
check total NAs in item_details data
total_NAs(item_details)
check total NAs in location_details data
total_NAs(location_details)
There are no NAs in both the datasets.
Check for any duplicate records in item_details, location_details
item_details[item_details.item_id.duplicated(keep = False)].sort_values('item_id')
location_details[location_details.location_id.duplicated(keep = False)].sort_values('location_id')
Joining datasets
train.columns
item_details.columns
location_details.columns
Changing the mismatched column names to perform join
location_details.columns=[x.replace('location_id','locationId') for x in location_details.columns]
Joining train, item_details dataframes
merged_data1=train.merge(item_details,how='left',on=['item_id'])
total_NAs(merged_data1)
print(train.shape);print(merged_data1.shape)
merged_data1.columns
merged_data1.head()
gc.collect()
Joining train, item_details dataframes and location_details
merged_data1=merged_data1.merge(location_details,how='left',on=['locationId'])
total_NAs(merged_data1)
print(merged_data1.shape)
merged_data1.columns
merged_data1.head()
print(merged_data1.memory_usage().sum() / 1024**2)
merged_data1=optimize_floats(merged_data1)
merged_data1=optimize_ints(merged_data1)
merged_data1=optimize_objects(merged_data1,[])
merged_data1=optimize(merged_data1)
print(merged_data1.memory_usage().sum() / 1024**2)
Taking backup of merged data
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('merged_data1',merged_data1)
Verify whether the backup files are created
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
print(os.getcwd())
!ls -lrt
Restore backedup merged data
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
merged_data1=reload_backupfiles('merged_data1')
del [[train, item_details, location_details]]
gc.collect()
EDA7: To get the clusters with top and least sales for each year
train_sub5=merged_data1.groupby(['year','cluster']).agg({'unit_sales':'sum'}).reset_index()
train_sub5
plt.figure(figsize=(15,7))
sns.barplot(x='year',y='unit_sales',hue='cluster',data=train_sub5)
Cluster 14 recorded the highest sales across all the 4 years.
print("\n5 top sales clusters in 2015\n")
print(train_sub5[train_sub5.year==2015].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least sales clusters in 2015\n")
print(train_sub5[train_sub5.year==2015].sort_values(by='unit_sales',ascending=True).head(5))
print("\n5 top sales clusters in 2016\n")
print(train_sub5[train_sub5.year==2016].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least sales clusters in 2016\n")
print(train_sub5[train_sub5.year==2016].sort_values(by='unit_sales',ascending=True).head(5))
print("\n5 top sales clusters in 2017\n")
print(train_sub5[train_sub5.year==2017].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least sales clusters in 2017\n")
print(train_sub5[train_sub5.year==2017].sort_values(by='unit_sales',ascending=True).head(5))
print("\n5 top sales clusters in 2018\n")
print(train_sub5[train_sub5.year==2018].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least sales clusters in 2018\n")
print(train_sub5[train_sub5.year==2018].sort_values(by='unit_sales',ascending=True).head(5))
# Top-5 clusters by total sales per year (indexed by cluster for plotting).
# NOTE(review): the 2015 frame was built from the year-2016 rows in the
# original (copy-paste bug); corrected to year==2015.
train_sub5_2015 = (train_sub5[train_sub5.year == 2015].sort_values(by='unit_sales', ascending=False).head(5)).set_index('cluster')
train_sub5_2016 = (train_sub5[train_sub5.year == 2016].sort_values(by='unit_sales', ascending=False).head(5)).set_index('cluster')
train_sub5_2017 = (train_sub5[train_sub5.year == 2017].sort_values(by='unit_sales', ascending=False).head(5)).set_index('cluster')
train_sub5_2018 = (train_sub5[train_sub5.year == 2018].sort_values(by='unit_sales', ascending=False).head(5)).set_index('cluster')
fig,ax = plt.subplots(2,2,figsize=(10,8))
fig.subplots_adjust(wspace=0.5,hspace=0.5)
(train_sub5_2015).plot(kind='bar',ax=ax[0][0])
ax[0][0].set_title('2015')
ax[0][0].set_xlabel('cluster')
ax[0][0].tick_params(axis ='x', rotation = 45)
ax[0][0].set_ylabel('unit_sales')
(train_sub5_2016).plot(kind='bar',ax=ax[0][1])
ax[0][1].set_title('2016')
ax[0][1].set_xlabel('cluster')
ax[0][1].tick_params(axis ='x', rotation = 45)
ax[0][1].set_ylabel('unit_sales')
(train_sub5_2017).plot(kind='bar',ax=ax[1][0])
ax[1][0].set_title('2017')
ax[1][0].set_xlabel('cluster')
ax[1][0].tick_params(axis ='x', rotation = 45)
ax[1][0].set_ylabel('unit_sales')
(train_sub5_2018).plot(kind='bar',ax=ax[1][1])
ax[1][1].set_title('2018')
ax[1][1].set_xlabel('cluster')
ax[1][1].tick_params(axis ='x', rotation = 45)
ax[1][1].set_ylabel('unit_sales')
fig.show()
Max sales are mostly from the same clusters across 4 years. Min sales are mostly from the same clusters across 4 years.
del [[train_sub5,train_sub5_2015,train_sub5_2016,train_sub5_2017, train_sub5_2018]]
gc.collect()
EDA8: To get the business unit that recorded maximum sales over all the 4 years.
train_sub6=merged_data1.groupby(['year','cluster','type']).agg({'unit_sales':'sum'}).reset_index().dropna()
train_sub6.head()
plt.figure(figsize=(15,7))
sns.boxplot(x='year',y='unit_sales',hue='type',data=train_sub6)
Business unit A accounts to highest sales across all the 4 years.
EDA9: To know the business unit with top sales across all the clusters
plt.figure(figsize=(15,7))
sns.barplot(x='cluster',y='unit_sales',hue='type',data=train_sub6)
Business units of type 'A' belonging to cluster 14 have recorded maximum sales.
del train_sub6
gc.collect()
EDA10: To get the average unit sales of each item category per Business unit
merged_data1.groupby(['category_of_item','type'])['unit_sales'].mean().unstack().plot(kind = 'bar', figsize = (15,6))
plt.title('Average unit_sales of each item category per Business unit')
plt.ylabel('unit_sales')
Prepared_food items in business unit A has highest mean sales.
EDA 11: To get the most sold items and least sold items for each year
train_sub7=merged_data1.groupby(['year','category_of_item']).agg({'unit_sales':'sum'}).reset_index().dropna()
train_sub7
plt.figure(figsize=(15,7))
sns.barplot(x='year',y='unit_sales',hue='category_of_item',data=train_sub7)
plt.legend(loc='upper right')
Notice that grocery items are sold the most in all the years.
print("\n5 top sales item categories in 2015\n")
print(train_sub7[train_sub7.year==2015].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least sales item categories in 2015\n")
print(train_sub7[train_sub7.year==2015].sort_values(by='unit_sales',ascending=True).head(5))
print("\n5 top sales item categories in 2016\n")
print(train_sub7[train_sub7.year==2016].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least item categories in 2016\n")
print(train_sub7[train_sub7.year==2016].sort_values(by='unit_sales',ascending=True).head(5))
print("\n5 top sales item categories in 2017\n")
print(train_sub7[train_sub7.year==2017].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least item categories in 2017\n")
print(train_sub7[train_sub7.year==2017].sort_values(by='unit_sales',ascending=True).head(5))
print("\n5 top sales item categories in 2018\n")
print(train_sub7[train_sub7.year==2018].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least item categories in 2018\n")
print(train_sub7[train_sub7.year==2018].sort_values(by='unit_sales',ascending=True).head(5))
# Top-5 item categories by total sales per year (indexed by category).
# NOTE(review): the 2015 frame was built from the year-2016 rows in the
# original (copy-paste bug); corrected to year==2015.
train_sub7_2015 = (train_sub7[train_sub7.year == 2015].sort_values(by='unit_sales', ascending=False).head(5)).set_index('category_of_item')
train_sub7_2016 = (train_sub7[train_sub7.year == 2016].sort_values(by='unit_sales', ascending=False).head(5)).set_index('category_of_item')
train_sub7_2017 = (train_sub7[train_sub7.year == 2017].sort_values(by='unit_sales', ascending=False).head(5)).set_index('category_of_item')
train_sub7_2018 = (train_sub7[train_sub7.year == 2018].sort_values(by='unit_sales', ascending=False).head(5)).set_index('category_of_item')
# 2x2 grid: one bar chart of the top-5 categories for each of the 4 years.
fig,ax = plt.subplots(2,2,figsize=(10,8))
fig.subplots_adjust(wspace=0.5,hspace=0.5)
(train_sub7_2015).plot(kind='bar',ax=ax[0][0])
ax[0][0].set_title('2015')
ax[0][0].set_xlabel('item category')
ax[0][0].tick_params(axis ='x', rotation = 45)
ax[0][0].set_ylabel('unit_sales')
(train_sub7_2016).plot(kind='bar',ax=ax[0][1])
ax[0][1].set_title('2016')
ax[0][1].set_xlabel('item category')
ax[0][1].tick_params(axis ='x', rotation = 45)
ax[0][1].set_ylabel('unit_sales')
(train_sub7_2017).plot(kind='bar',ax=ax[1][0])
ax[1][0].set_title('2017')
ax[1][0].set_xlabel('item category')
ax[1][0].tick_params(axis ='x', rotation = 45)
ax[1][0].set_ylabel('unit_sales')
(train_sub7_2018).plot(kind='bar',ax=ax[1][1])
ax[1][1].set_title('2018')
ax[1][1].set_xlabel('item category')
ax[1][1].tick_params(axis ='x', rotation = 45)
ax[1][1].set_ylabel('unit_sales')
fig.show()
Notice that groceries, beverages, dairy products, farm produces and cleaning utensils are sold the most in all the 4 years. Small basket should make sure these products supply meet the existing high demand.
They should also make sure that items with least sales are not stocked in surplus.
# Free the intermediate frames.  `gc` was first imported only much later
# (cell near the train/test split), so this cell raised NameError when run
# in order; import it here before use.
import gc
del [[train_sub7,train_sub7_2015,train_sub7_2016,train_sub7_2017,train_sub7_2018]]
gc.collect()
Top sales are mostly amongst same class i.e., group of items across the 4 years
EDA 12: Cities which recorded high sales for each item categories for each year
# EDA12: yearly total unit_sales per (city, category).
train_sub8=merged_data1.groupby(['year','city','category_of_item']).agg({'unit_sales':'sum'}).reset_index().dropna()
train_sub8
# Fix: all reads below referenced `train_ts_sub8`, which is never defined
# (the groupby result is `train_sub8`), so every line raised NameError.
print("\n5 top selling item categories across each city in 2015\n")
print(train_sub8[train_sub8.year==2015].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least selling item categories across each city city in 2015\n")
print(train_sub8[train_sub8.year==2015].sort_values(by='unit_sales',ascending=True).head(5))
print("\n5 top selling item categories across each city in 2016\n")
print(train_sub8[train_sub8.year==2016].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least selling item categories across each city in 2016\n")
print(train_sub8[train_sub8.year==2016].sort_values(by='unit_sales',ascending=True).head(5))
print("\n5 top selling item categories across each city in 2017\n")
print(train_sub8[train_sub8.year==2017].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least selling item categories across each city in 2017\n")
print(train_sub8[train_sub8.year==2017].sort_values(by='unit_sales',ascending=True).head(5))
print("\n5 top selling item categories across each city in 2018\n")
print(train_sub8[train_sub8.year==2018].sort_values(by='unit_sales',ascending=False).head(5))
print("\n5 least selling item categories across each city in 2018\n")
print(train_sub8[train_sub8.year==2018].sort_values(by='unit_sales',ascending=True).head(5))
# Fix: the 2015 frame was filtered on year==2016 (copy/paste slip).
# The assigned names are kept as train_ts_sub8_* so the plotting cell below
# continues to work unchanged.
train_ts_sub8_2015=(train_sub8[train_sub8.year==2015].sort_values(by='unit_sales',ascending=False).head(5)).reset_index()
train_ts_sub8_2016=(train_sub8[train_sub8.year==2016].sort_values(by='unit_sales',ascending=False).head(5)).reset_index()
train_ts_sub8_2017=(train_sub8[train_sub8.year==2017].sort_values(by='unit_sales',ascending=False).head(5)).reset_index()
train_ts_sub8_2018=(train_sub8[train_sub8.year==2018].sort_values(by='unit_sales',ascending=False).head(5)).reset_index()
# Drop unused category levels so the legend only shows cities/categories
# actually present in each top-5 subset.
# NOTE(review): assumes city/category_of_item are categorical dtype here
# (they are, after reduce_mem_usage) — verify if the pipeline changes.
train_ts_sub8_2015['city'] = train_ts_sub8_2015.city.cat.remove_unused_categories()
train_ts_sub8_2016['city'] = train_ts_sub8_2016.city.cat.remove_unused_categories()
train_ts_sub8_2017['city'] = train_ts_sub8_2017.city.cat.remove_unused_categories()
train_ts_sub8_2018['city'] = train_ts_sub8_2018.city.cat.remove_unused_categories()
train_ts_sub8_2015['category_of_item']=train_ts_sub8_2015.category_of_item.cat.remove_unused_categories()
train_ts_sub8_2016['category_of_item']=train_ts_sub8_2016.category_of_item.cat.remove_unused_categories()
train_ts_sub8_2017['category_of_item']=train_ts_sub8_2017.category_of_item.cat.remove_unused_categories()
train_ts_sub8_2018['category_of_item']=train_ts_sub8_2018.category_of_item.cat.remove_unused_categories()
# 2x2 grid: per-year bar chart of top (city, category) sales combinations.
fig,ax = plt.subplots(2,2,figsize=(10,8))
fig.subplots_adjust(wspace=0.5,hspace=0.5)
sns.barplot(x='category_of_item',y='unit_sales',hue='city',data=train_ts_sub8_2015,ax=ax[0][0])
ax[0][0].set_title('2015')
ax[0][0].set_xlabel('category_of_item')
ax[0][0].tick_params(axis ='x', rotation = 45)
ax[0][0].set_ylabel('unit_sales')
sns.barplot(x='category_of_item',y='unit_sales',hue='city',data=train_ts_sub8_2016,ax=ax[0][1])
ax[0][1].set_title('2016')
ax[0][1].set_xlabel('category_of_item')
ax[0][1].tick_params(axis ='x', rotation = 45)
ax[0][1].set_ylabel('unit_sales')
sns.barplot(x='category_of_item',y='unit_sales',hue='city',data=train_ts_sub8_2017,ax=ax[1][0])
ax[1][0].set_title('2017')
ax[1][0].set_xlabel('category_of_item')
ax[1][0].tick_params(axis ='x', rotation = 45)
ax[1][0].set_ylabel('unit_sales')
sns.barplot(x='category_of_item',y='unit_sales',hue='city',data=train_ts_sub8_2018,ax=ax[1][1])
ax[1][1].set_title('2018')
ax[1][1].set_xlabel('category_of_item')
ax[1][1].tick_params(axis ='x', rotation = 45)
ax[1][1].set_ylabel('unit_sales')
fig.show()
High sales for most sold items are seen in Bangalore and Mumbai.
Small basket should make sure it fills the warehouses in these 2 locations on priority.
# Free the intermediate frames.  Fix: the original deleted
# train_sub8_2015..2018, names that never existed (the frames are
# train_ts_sub8_*), which raised NameError; also `gc` is imported here
# because its file-level import only happens in a later cell.
import gc
del [[train_sub8,train_ts_sub8_2015,train_ts_sub8_2016,train_ts_sub8_2017,train_ts_sub8_2018]]
gc.collect()
EDA13: sales trend over the years for perishable versus non-perishable items
# EDA13: yearly total unit_sales, perishable vs non-perishable.
train_sub9=merged_data1.groupby(['year','perishable']).agg({'unit_sales':'sum'}).reset_index().dropna()
train_sub9.head()
plt.figure(figsize=(15,7))
sns.barplot(x='year',y='unit_sales',hue='perishable',data=train_sub9)
del train_sub9
# NOTE(review): gc is only imported in a later cell; this line assumes the
# notebook was run out of order at least once — confirm.
gc.collect()
Loading and processing train_transactions data
# Load the per-location daily transaction counts and aggregate per year.
os.chdir(r"/content/drive/My Drive/PHD")
train_transactions=pd.read_csv('train_transactions.csv')
train_transactions=reduce_mem_usage(train_transactions)
understand_data(train_transactions)
train_transactions['date']=pd.to_datetime(train_transactions['date'])
train_transactions['year']=train_transactions.date.dt.year
train_sub10=train_transactions.groupby(['location_identifier','year']).agg({'transactions':'sum'}).reset_index().dropna()
train_sub10.head()
EDA14: To see whether total transactions at each location have increased or decreased over years
# EDA14: one small panel per location, showing its yearly transaction trend.
# Fixes: iterate the *unique* location ids (the grouped frame has one row
# per location-year, so the raw column contains duplicates), and plot only
# that location's slice — the original passed the full train_sub10 to
# pointplot, so every panel showed the identical global trend.
list_check = train_sub10['location_identifier'].unique()
plt.figure(figsize = (20,160))
plt.subplots_adjust(wspace=0.2,hspace=0.5)
for j,i in enumerate(list_check, start=1):
    sub = train_sub10[(train_sub10.location_identifier == i)]
    plt.subplot(53,4,j)
    sns.pointplot(sub['year'],sub['transactions'])
    plt.title(f'{i}')
plt.show()
-------------------------------------------------- EDA Finished
Data Pre-processing for statistical and machine learning models
Function to reduce memory consumption
def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.

    Numeric columns are downcast to the smallest int/float subtype whose
    range covers the column's observed min/max; object columns become pandas
    categoricals.  Prints memory usage before/after and returns the
    (mutated) dataframe.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            df[col] = df[col].astype('category')
            continue
        # Skip datetimes and booleans: min/max-based downcasting does not
        # apply to them.  (The original printed a stray blank line for every
        # datetime column, and a bool column would have fallen into the
        # float branch and been corrupted to float16.)
        if str(col_type).startswith('datetime') or col_type == bool:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                df[col] = df[col].astype(np.int64)
        else:
            # NOTE: float16 keeps only ~3 significant digits; acceptable for
            # EDA here, but beware of precision loss for modelling inputs.
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
Loading train data
# Load the raw training data from Drive.
os.chdir(r"/content/drive/My Drive/PHD")
train=pd.read_csv('train.csv')
In EDA it was observed that there is some trend over the years and strong monthly seasonality.
Also, the monthly seasonality appears to repeat over alternate years.
Aim is to come up with a model that can do sales forecast for the first 15 days of January.
Working on data from Dec 2017 to Feb 2018 so that along with seasonality trend before and after January (previous year) is captured.
Taking subset of data from the entire train data
train=train[(train['date'] >= '2017-12-01') & (train['date'] <= '2018-02-31')]
Checking the dimensions of the reduced train data
print(train.shape)
For any time series data we should have continuous data.
Since we took three months data - Dec, Jan and Feb, the expectation is to have 90 unique dates.
# Dec+Jan+Feb should give 90 distinct dates; 25 Dec 2017 turns out missing.
print(train.date.unique())
print(len(train.date.unique()))
Notice that 25th December 2017 is missing, imputing missing date
import datetime
from datetime import date, timedelta
# Rebuild a gap-free daily calendar between the first and last observed
# dates, then left-join the sales onto it so the missing day (25 Dec 2017)
# appears as a row of NaNs to be imputed below.
min_date = min(train.date)
max_date = max(train.date)
calendar = pd.DataFrame(
    {'date': pd.date_range(min_date, max_date).strftime('%Y-%m-%d')}
)
train = calendar.merge(train, left_on='date', right_on='date', how='left')
Check that the missing date is now present in train data after imputation
# Confirm all 90 dates are now present, then inspect shape/head/dtypes.
print(train.date.unique())
print(len(train.date.unique()))
train.shape
train.head(10)
train.dtypes
Memory optimization for train data
# Re-key locationId to dense integer codes 1..N so it fits in int8.
locationId_unique = train['locationId'].unique()
locationId_mapping = {locationId: idx for
idx, locationId in
enumerate(locationId_unique, 1)}
train['locationId'] = train['locationId'].map(locationId_mapping)
print(train.locationId.min())
print(train.locationId.max())
print("int8 datatype can handle integer values ranging from {} to {}".format(np.iinfo(np.int8).min,np.iinfo(np.int8).max))
train['locationId'] = train['locationId'].astype("int8")
# Re-key item_id to dense integer codes 1..N so it fits in int16.
item_id_unique = train['item_id'].unique()
item_id_mapping = {item_id: idx for
idx, item_id in
enumerate(item_id_unique, 1)}
train['item_id'] = train['item_id'].map(item_id_mapping)
print(train.item_id.min())
print(train.item_id.max())
print("int16 datatype can handle integer values ranging from {} to {}".format(np.iinfo(np.int16).min,np.iinfo(np.int16).max))
train['item_id'] = train['item_id'].astype("int16")
Modifying the existing features and generating new features out of the existing ones
import datetime
def process_data(data):
    """Parse `date` to datetime and derive day-name, month-name and year
    columns in place (returns None)."""
    data['date'] = pd.to_datetime(data['date'])
    data['day'] = data['date'].dt.day_name()
    data['month'] = data['date'].dt.strftime('%B')
    data['year'] = data['date'].dt.year
# Derive day/month/year features on train, then preview.
process_data(train)
train.head()
Dropping unwanted columns
# The raw date is redundant once day/month/year are extracted.
train.drop(['date'],axis=1,inplace=True)
train.dtypes
Further memory optimization to reduce file size
# Downcast numeric columns / categorize objects to shrink the frame.
train = reduce_mem_usage(train)
train.dtypes
Understanding train data
def understand_data(data):
    """Return a per-column summary: dtype, NA count, cardinality, and the
    stringified distinct values."""
    summary = {
        "Datatype": data.dtypes,
        "No of NAs": data.isna().sum(),
        "No of Levels": data.apply(lambda col: col.nunique(), axis=0),
        "Levels": data.apply(lambda col: str(col.unique()), axis=0),
    }
    return pd.DataFrame(summary)
understand_data(train)
Handling Null Values
def total_NAs(data):
    """Print the grand total of missing values across the dataframe."""
    missing = data.isna().sum().sum()
    print("Total NAs:", missing)
Check total NAs in train data
# Overall and per-column missing-value counts for train.
total_NAs(train)
print(train.isnull().sum(), '\n')
For the imputed date record, other values are missing.
Imputing values for locationId, item_id, onpromotion and unit sales for the imputed data
# Impute unit_sales for the calendar-inserted NaN rows by averaging the
# nearest non-NaN neighbours (previous row, and the first row after the
# current NaN run).
#Check index to apply the formula
na_index_train = train[train['unit_sales'].isnull() == True].index.values
#Define the index to use to apply the formula
na_index_train_plus = na_index_train.copy()
na_index_train_minus = np.maximum(0, na_index_train-1)
for i in range(len(na_index_train)):
    # Advance k past any consecutive run of NaN indices so the "+" neighbour
    # is the first non-missing row after the run.
    k = 1
    while (na_index_train[min(i+k,len(na_index_train)-1)] == na_index_train[i]+k):
        k += 1
    na_index_train_plus[i] = min(len(train)-1, na_index_train_plus[i] + k )
#Apply the formula
for i in range(len(na_index_train)):
    if (na_index_train[i] == 0):
        # First row missing: copy the next known value.
        train.loc[na_index_train[i], 'unit_sales'] = train.loc[na_index_train_plus[i], 'unit_sales']
    elif (na_index_train[i] == len(train) - 1):
        # Last row missing: copy the previous known value.  (Fixed: the
        # original compared against len(train), which a 0-based index can
        # never equal, so this branch was unreachable.)
        train.loc[na_index_train[i], 'unit_sales'] = train.loc[na_index_train_minus[i], 'unit_sales']
    else:
        # Interior row: mean of the neighbours on either side.
        train.loc[na_index_train[i], 'unit_sales'] = (train.loc[na_index_train_plus[i], 'unit_sales'] + train.loc[na_index_train_minus[i], 'unit_sales'])/ 2
# Missing promotion flags on the inserted rows default to "not on promotion".
train.onpromotion.fillna(value=False,inplace=True)
print(train.isnull().sum(), '\n')
Checking summary statistics
train.describe(include='all')
Notice that unit_sales has negative values too. They can be considered as products returned.
So, let us retain them and use them as they are.
Creating a folder in colab to take save backedup data
# Recreate a clean backup folder, discarding any previous contents.
PATH='/content/drive/My Drive/PHD/MemoryReducedFiles'
if os.path.exists(PATH):
    shutil.rmtree(PATH)
os.mkdir(PATH)
Taking text backup and storing backedup files in the created folder path
def create_backupfiles(filename, dfin):
    """Pickle `dfin` to `filename` in the current working directory.

    Uses a context manager so the file handle is closed even if
    pickling raises (the original left the handle open on error).
    """
    with open(filename, 'wb') as outfile:
        pkl.dump(dfin, outfile)
# Persist the processed train frame to the backup folder.
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('train',train)
Reload backedup data
def reload_backupfiles(filename):
    """Load and return the pickled object stored at `filename`.

    Uses a context manager so the file handle is closed even if
    unpickling raises (the original left the handle open on error).
    """
    with open(filename, 'rb') as infile:
        return pkl.load(infile)
# Restore train from the pickled backup.
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
train=reload_backupfiles('train')
# Reclaim memory after the heavy load/reload cycle.
import gc
gc.collect()
Loading and understanding test data
# Load and profile the test data.
os.chdir(r"/content/drive/My Drive/PHD")
test=pd.read_csv('test.csv')
total_NAs(test)
test.shape
understand_data(test)
Check for continuous date values in test data
# Test should cover the 15 forecast days with no gaps.
print(test.date.unique())
print(len(test.date.unique()))
Memory optimization for test data
# Re-key test locationId to dense 1..N codes for int8 storage.
# NOTE(review): this mapping is rebuilt from test's own unique order, so
# codes may not agree with the train mapping — both are re-encoded jointly
# later via dict_items2, but confirm.
locationId_unique = test['locationId'].unique()
locationId_mapping = {locationId: idx for
idx, locationId in
enumerate(locationId_unique, 1)}
test['locationId'] = test['locationId'].map(locationId_mapping)
print(test.locationId.min())
print(test.locationId.max())
print("int8 datatype can handle integer values ranging from {} to {}".format(np.iinfo(np.int8).min,np.iinfo(np.int8).max))
test['locationId'] = test['locationId'].astype("int8")
# Re-key test item_id to dense 1..N codes for int16 storage, and make the
# promotion flag a string category.
item_id_unique = test['item_id'].unique()
item_id_mapping = {item_id: idx for
idx, item_id in
enumerate(item_id_unique, 1)}
test['item_id'] = test['item_id'].map(item_id_mapping)
print(test.item_id.min())
print(test.item_id.max())
print("int16 datatype can handle integer values ranging from {} to {}".format(np.iinfo(np.int16).min,np.iinfo(np.int16).max))
test['item_id'] = test['item_id'].astype("int16")
test['onpromotion']=test['onpromotion'].astype('str').astype('category')
Modifying existing features and generating new features out of the existing for test data
# Derive day/month/year features on test.
process_data(test)
test.dtypes
Dropping unwanted columns
# Raw date no longer needed after feature extraction.
test.drop('date',axis=1,inplace=True)
test.shape
understand_data(test)
Create backup for test data
# Persist the processed test frame.
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('test',test)
Reload backedup data
# Restore test from backup and reclaim memory.
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
test=reload_backupfiles('test')
import gc
gc.collect()
Loading and understanding item details data
# Load and profile the item metadata.
os.chdir(r"/content/drive/My Drive/PHD")
item_details=pd.read_csv('item_details.csv')
total_NAs(item_details)
understand_data(item_details)
Loading and understanding location details data
# Load and profile the location metadata.
os.chdir(r"/content/drive/My Drive/PHD")
location_details=pd.read_csv('location_details.csv')
total_NAs(location_details)
understand_data(location_details)
Memory optimization for item_details data
# Re-key item_details item_id to dense 1..N codes for int16 storage.
# NOTE(review): mapping is derived independently of train/test; the later
# joint re-encoding via dict_items1 is what aligns them — confirm.
item_id_unique = item_details['item_id'].unique()
item_id_mapping = {item_id: idx for
idx, item_id in
enumerate(item_id_unique, 1)}
item_details['item_id'] = item_details['item_id'].map(item_id_mapping)
print(item_details.item_id.min())
print(item_details.item_id.max())
print("int16 datatype can handle integer values ranging from {} to {}".format(np.iinfo(np.int16).min,np.iinfo(np.int16).max))
item_details['item_id'] = item_details['item_id'].astype("int16")
Changing the column names to be consistent with those in the train data
location_details.columns=[x.replace('location_id','locationId') for x in location_details.columns]
Memory optimization for location_details data
# Re-key location_details locationId for int8 storage, then shrink both
# metadata frames.
locationId_unique = location_details['locationId'].unique()
locationId_mapping = {locationId: idx for
idx, locationId in
enumerate(locationId_unique, 1)}
location_details['locationId'] = location_details['locationId'].map(locationId_mapping)
print(location_details.locationId.min())
print(location_details.locationId.max())
print("int8 datatype can handle integer values ranging from {} to {}".format(np.iinfo(np.int8).min,np.iinfo(np.int8).max))
location_details['locationId'] = location_details['locationId'].astype("int8")
understand_data(item_details)
understand_data(location_details)
item_details=reduce_mem_usage(item_details)
location_details=reduce_mem_usage(location_details)
understand_data(item_details)
understand_data(location_details)
Create backups for item_details and location_details files
# Persist both metadata frames.
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('item_details',item_details)
create_backupfiles('location_details',location_details)
Reload backup files
# Restore the metadata frames and reclaim memory.
item_details=reload_backupfiles('item_details')
location_details=reload_backupfiles('location_details')
gc.collect()
Joining train, item_details dataframes
# Left-join item metadata onto train (row count must be unchanged).
merged_data1=train.merge(item_details,how='left',on=['item_id'])
total_NAs(merged_data1)
print(train.shape);print(merged_data1.shape)
print(merged_data1.columns)
merged_data1.head()
Joining train, item_details dataframes and location_details
# Left-join location metadata; widen unit_sales back to float32 (float16
# from the memory pass is too coarse for modelling).
merged_data1=merged_data1.merge(location_details,how='left',on=['locationId'])
total_NAs(merged_data1)
print(merged_data1.shape)
merged_data1.columns
merged_data1.head()
understand_data(merged_data1)
merged_data1.unit_sales=merged_data1.unit_sales.astype('float32')
merged_data1.dtypes
Taking backup of merged data
# Persist the merged train frame.
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('merged_data1',merged_data1)
Restore backedup merged data
# Restore the merged train frame from backup.
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
merged_data1=reload_backupfiles('merged_data1')
Joining test, item_details dataframes
# Left-join item metadata onto test.
merged_test=test.merge(item_details,how='left',on=['item_id'])
total_NAs(merged_test)
print(test.shape);
print(merged_test.shape)
print(merged_test.columns)
merged_test.head()
Joining test, item_details dataframes and location_details
# Left-join location metadata onto test.
merged_test=merged_test.merge(location_details,how='left',on=['locationId'])
total_NAs(merged_test)
print(merged_test.shape)
merged_test.columns
merged_test.head()
understand_data(merged_test)
Creating backup for merged test file
# Persist the merged test frame.
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('merged_test',merged_test)
Restoring backuped merged test file
# Restore merged test and drop the submission id (saved separately below).
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
merged_test=reload_backupfiles('merged_test')
merged_test.drop(['id'],axis=1,inplace=True)
Saving id column from test file - to use while generating test predictions
# Preserve the test ids for building the submission file later.
# NOTE(review): `id` shadows the builtin id(); kept because later cells
# outside this view may reference it — consider renaming to test_ids.
id=pd.DataFrame(test.id)
id.shape
del [[train,test,item_details,location_details]]
gc.collect()
Label Encoding item_id
def setdiff_sorted(array1, array2, assume_unique=False):
    """Return elements of array1 not present in array2 as a list.

    np.setdiff1d already returns a sorted unique result when
    assume_unique is False; when True we sort explicitly.
    """
    diff = np.setdiff1d(array1, array2, assume_unique).tolist()
    return sorted(diff) if assume_unique else diff
# Build the union of item levels across train and test (test may contain
# items unseen in train) so one shared encoding covers both.
merged_data1.item_id=merged_data1.item_id.astype('str').astype('category')
merged_test.item_id=merged_test.item_id.astype('str').astype('category')
merged_data1_item_levels=merged_data1['item_id'].cat.categories.tolist()
merged_test_item_levels=merged_test['item_id'].cat.categories.tolist()
new_item_levels=setdiff_sorted(merged_test_item_levels,merged_data1_item_levels)
total_item_levels=merged_data1_item_levels+new_item_levels
print('\nItems in merged_data1 only:\n')
print(len(merged_data1_item_levels))
print('\nItems in merged_test only:\n')
print(len(merged_test_item_levels))
print('\nTotal items in merged_data1 and merged_test are:\n')
print(len(total_item_levels))
Label Encoding item_id and locationId columns in merged_data1
total_NAs(merged_data1)
# Map every item level to a dense integer code.  Fix: derive the code range
# from the actual level count instead of the hard-coded 3828 — zip() would
# silently truncate the mapping (leaving NaNs after .map) if the number of
# levels ever differed.
encoded_values1=list(range(len(total_item_levels)))
dict_items1=dict(zip(total_item_levels,encoded_values1))
merged_data1['item_id']=merged_data1['item_id'].map(dict_items1)
merged_data1.item_id=merged_data1.item_id.astype('int32')
# Same dense-code treatment for locationId (levels taken from train only;
# test locations are a subset).
merged_data1.locationId=merged_data1.locationId.astype('str').astype('category')
merged_test.locationId=merged_test.locationId.astype('str').astype('category')
total_location_levels=merged_data1['locationId'].unique().tolist()
encoded_values2=list(range(len(total_location_levels)))
dict_items2=dict(zip(total_location_levels,encoded_values2))
merged_data1['locationId']=merged_data1['locationId'].map(dict_items2)
merged_data1.locationId=merged_data1.locationId.astype('int32')
Label Encoding item_id and locationId columns in merged_testdata
# Apply the shared train/test encodings to the test frame.
merged_test['item_id']=merged_test['item_id'].map(dict_items1)
merged_test.item_id=merged_test.item_id.astype('int32')
merged_test['locationId']=merged_test['locationId'].map(dict_items2)
merged_test.locationId=merged_test.locationId.astype('int32')
Function to drop unimportant columns
def drop_unimp_cols(data, unimp_cols):
    """Remove `unimp_cols` from `data` in place, logging what is dropped."""
    print("Deleting unimportant columns", unimp_cols)
    data.drop(columns=unimp_cols, inplace=True)
Understanding important Features
(merged_data1.category_of_item.value_counts(normalize = True)*100).plot(kind = 'bar', figsize = (15,6))
Taking the top 10 categories of "category_of_items" for model building
One hot encoding for multiple categorical levels using the top 10 levels
# Function to do one-hot encoding for a categorical variable with multiple levels
def one_hot_top_x(data, variable, top_x_labels):
    """Add a 0/1 indicator column `<variable>_<label>` for each label."""
    for lbl in top_x_labels:
        data[variable + '_' + lbl] = np.where(data[variable] == lbl, 1, 0)
# Find the 10 most frequent categories for category_of_item
top_10_category_of_item = [x for x in merged_data1.category_of_item.value_counts().sort_values(ascending=False).head(10).index]
# Now create the 10 most frequent dummy variables for category_of_item
one_hot_top_x(merged_data1, 'category_of_item', top_10_category_of_item)
merged_data1.head()
# Relative frequencies of city, state and cluster levels.
(merged_data1.city.value_counts(normalize = True)*100).plot(kind = 'bar', figsize = (15,6))
(merged_data1.state.value_counts(normalize = True)*100).plot(kind = 'bar', figsize = (15,6))
(merged_data1.cluster.value_counts(normalize = True)*100).plot(kind = 'bar', figsize = (15,6))
Taking the top 10 categories of "clusters" for model building
One hot encoding for multiple categorical levels using the top 10 levels
# Find the 10 most frequent categories for cluster
merged_data1.cluster=merged_data1.cluster.astype('str')
top_10_cluster = [x for x in merged_data1.cluster.value_counts().sort_values(ascending=False).head(10).index]
# Now create the 10 most frequent dummy variables for cluster
one_hot_top_x(merged_data1, 'cluster', top_10_cluster)
merged_data1.head()
merged_data1.columns
# Originals are redundant once the top-10 dummies exist.
drop_unimp_cols(merged_data1,['cluster','category_of_item','class','city','state'])
understand_data(merged_data1)
import pandas as pd
from typing import List
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast every float64 column to the smallest sufficient float dtype."""
    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='float')
    return df
def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast every int64 column to the smallest sufficient integer dtype."""
    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = pd.to_numeric(df[col], downcast='integer')
    return df
def optimize_objects(df: pd.DataFrame, datetime_features: List[str]) -> pd.DataFrame:
    """Convert object columns: names in `datetime_features` are parsed to
    datetimes; the rest become categoricals when under 50% unique."""
    for col in df.select_dtypes(include=['object']):
        if col in datetime_features:
            df[col] = pd.to_datetime(df[col])
        elif len(df[col].unique()) / len(df[col]) < 0.5:
            df[col] = df[col].astype('category')
    return df
def optimize(df: pd.DataFrame, datetime_features: List[str] = None):
    """Run all dtype optimizations (objects, then ints, then floats) on df.

    `datetime_features` defaults to an empty list; a None sentinel replaces
    the original mutable-default `[]` (shared-state anti-pattern).
    """
    if datetime_features is None:
        datetime_features = []
    return optimize_floats(optimize_ints(optimize_objects(df, datetime_features)))
# Shrink merged train and snapshot it.  NOTE(review): the three individual
# optimize_* calls are redundant — optimize() repeats them all.
print(merged_data1.memory_usage().sum() / 1024**2)
merged_data1=optimize_floats(merged_data1)
merged_data1=optimize_ints(merged_data1)
merged_data1=optimize_objects(merged_data1,[])
merged_data1=optimize(merged_data1)
print(merged_data1.memory_usage().sum() / 1024**2)
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('merged_data1',merged_data1)
merged_data1=reload_backupfiles('merged_data1')
One hot encoding for attributes in test data
One hot encoding for top 10 categorical levels in category_of_item in merged_test data
(merged_test.category_of_item.value_counts(normalize = True)*100).plot(kind = 'bar', figsize = (15,6))
Notice that top 10 most popular items in test and train are the same.
# Now create the 10 most frequent dummy variables for category_of_item
# (reusing the TRAIN top-10 list so both frames get identical columns).
one_hot_top_x(merged_test, 'category_of_item', top_10_category_of_item)
merged_test.head()
One hot encoding with top 10 categories of cluster column
(merged_test.cluster.value_counts(normalize = True)*100).plot(kind = 'bar', figsize = (15,6))
Notice that the top 10 popular clusters in test and train are the same.
# Now create the 10 most frequent dummy variables for cluster (train's list).
one_hot_top_x(merged_test, 'cluster', top_10_cluster)
merged_test.head()
merged_test.columns
drop_unimp_cols(merged_test,['category_of_item','cluster','class','city','state'])
# Shrink merged test and snapshot it.
print(merged_test.memory_usage().sum() / 1024**2)
merged_test=optimize_floats(merged_test)
merged_test=optimize_ints(merged_test)
merged_test=optimize_objects(merged_test,[])
merged_test=optimize(merged_test)
print(merged_test.memory_usage().sum() / 1024**2)
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('merged_test',merged_test)
merged_test=reload_backupfiles('merged_test')
print("======verify that the no of columns are the same in train,val and test after dummification======")
print(merged_data1.shape[1])
print(merged_test.shape[1])
Before dummification, except for the target column, all columns present in train are also present in test.
# Visual check of column parity, then alias test for the modelling cells.
print(merged_data1.columns)
print(merged_test.columns)
# Renaming test file
test=merged_test.copy()
Train Validation split
# Separate features/target and make an 80/20 train/validation split.
# NOTE(review): a random split leaks future days into training for a
# time-series task — a date-based split may be preferable; confirm intent.
X=merged_data1.drop(['unit_sales'],axis=1)
y=pd.DataFrame(merged_data1['unit_sales'])
print(X.columns);print(y.columns)
# Split train and validation data
from sklearn.model_selection import train_test_split
train_x,val_x,train_y,val_y=train_test_split(X,y,test_size=0.2,random_state=42)
print(train_x.shape)
print(val_x.shape)
print(train_y.shape)
print(val_y.shape)
# Unused manual 70/30 chronological split kept for reference only.
#def train_test_split(X,y):
# len=X.shape[0]
# train_size = int((len) * 0.7)
# train_x, val_x = X[:train_size], X[train_size:]
# train_y, val_y = y[:train_size], y[train_size:]
# return (train_x,val_x,train_y,val_y)
#train_x,val_x,train_y,val_y=train_test_split(X,y)
train_x.columns
test.columns
# Free the full merged frames now that the splits exist.
del [[merged_data1,merged_test,X,y]]
gc.collect()
Dummifying other categorical variables
# One-hot encode the remaining categorical columns in all three frames.
cat_cols=['onpromotion','day','month','year','type']
train_x=pd.get_dummies(train_x,columns=cat_cols,drop_first=False)
val_x=pd.get_dummies(val_x,columns=cat_cols,drop_first=False)
test=pd.get_dummies(test,columns=cat_cols,drop_first=False)
# Column counts can differ here if a level is absent from one split.
print("======verify that the no of columns are the same in train,val and test after dummification======")
print(train_x.shape[1])
print(val_x.shape[1])
print(test.shape[1])
Checking Alignment
# Inspect column mismatches, then outer-align all frames so every dummy
# column exists everywhere (missing ones filled with 0).
set(train_x.columns.tolist()) - set(val_x.columns.tolist())
set(val_x.columns.tolist()) - set(train_x.columns.tolist())
set(train_x.columns.tolist()) - set(test.columns.tolist())
set(test.columns.tolist()) - set(train_x.columns.tolist())
train_x_aligned, test_aligned = train_x.align(test, join = 'outer', axis = 1, fill_value = 0)
print(train_x_aligned.shape)
print(test_aligned.shape)
train_x_aligned, val_x_aligned = train_x_aligned.align(val_x, join = 'outer', axis = 1, fill_value = 0)
print(train_x_aligned.shape)
print(val_x_aligned.shape)
print("======verify that the no of columns are the same in train,val and test after alignment======")
print(train_x_aligned.shape[1])
print(val_x_aligned.shape[1])
print(test_aligned.shape[1])
# Free the pre-alignment frames and level lists.
del [[train_x,val_x,test]]
del [[top_10_cluster,top_10_category_of_item,merged_data1_item_levels,merged_test_item_levels,total_location_levels,total_item_levels]]
gc.collect()
# Renaming
train_x=train_x_aligned
val_x=val_x_aligned
test=test_aligned
# Snapshot all five modelling frames, then restore them (backup round-trip).
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('train_x',train_x)
create_backupfiles('train_y',train_y)
create_backupfiles('val_x',val_x)
create_backupfiles('val_y',val_y)
create_backupfiles('test',test)
train_x=reload_backupfiles('train_x')
train_y=reload_backupfiles('train_y')
val_x=reload_backupfiles('val_x')
val_y=reload_backupfiles('val_y')
test=reload_backupfiles('test')
del [[train_x_aligned,val_x_aligned,test_aligned]]
gc.collect()
Target Distribution Analysis
# Boxplot of the target, then inspect the extreme tails on both sides
# (negatives are product returns).
plt.figure(figsize = (15,6))
sns.boxplot(train_y['unit_sales'])
train_y[train_y.unit_sales>4000]
train_y[train_y.unit_sales<-300]
# Function to get Upper and Lower Quantiles for all the numerical columns of a dataset
def getUpperLowerQuantiles(data):
    """Return (1st-percentile list, 99th-percentile list), one entry per
    numeric (int64/float64) column of `data`."""
    num_cols = data.select_dtypes(include=['int64', 'float64']).columns
    print(num_cols)
    lowers = [data[c].quantile(.01) for c in num_cols]
    uppers = [data[c].quantile(.99) for c in num_cols]
    return (lowers, uppers)
# Get the Upper and Lower Quantiles
#LowerQuantile,UpperQuantile=list(getUpperLowerQuantiles(train_y))
# Function to do quantile clipping in case of any outliers
def clip_outliers(data, lower=None, upper=None):
    """Clip the columns of `data` in place to per-column [lower, upper].

    `lower`/`upper` default to the module-level LowerQuantile/UpperQuantile
    lists (computed by getUpperLowerQuantiles) for backward compatibility;
    passing them explicitly removes the hidden global dependency.
    """
    if lower is None:
        lower = LowerQuantile
    if upper is None:
        upper = UpperQuantile
    data.clip(lower=lower, axis=1, inplace=True)
    data.clip(upper=upper, axis=1, inplace=True)
# Clip Outliers (if any) in train data
#clip_outliers(train_y)
# Clip Outliers (if any) in validation data
#clip_outliers(val_y)
There are a few extreme values; retaining them since they are not many.
# Distribution of log|unit_sales| (abs handles the negative return rows).
train_y_abs=pd.DataFrame(np.abs(train_y),columns=['unit_sales'])
sns.distplot(np.log(abs(train_y['unit_sales'])),bins=10,kde=True)
# Re-attach the target so per-type slices can be taken for plotting.
train_xy=pd.concat([train_x,train_y],axis=1)
val_xy=pd.concat([val_x,val_y],axis=1)
# Per business-unit-type distribution of log1p-shifted unit_sales.
# NOTE(review): the +1806 shift presumably moves the most negative return
# value above zero before the log — confirm against the data minimum.
# Fix: there are FIVE types but the original 2x2 grid reused position 4 for
# both D and E, so the type-D plot was silently overwritten; use a 2x3 grid
# with one position per type.
plt.figure(figsize = (20,12))
plt.subplots_adjust(wspace=0.3,hspace=0.7)
plt.subplot(2,3,1)
sns.distplot(np.log1p(train_xy[train_xy.type_A == 1]['unit_sales']+1806))
plt.title('A')
plt.subplot(2,3,2)
sns.distplot(np.log1p(train_xy[train_xy.type_B == 1]['unit_sales']+1806))
plt.title('B')
plt.subplot(2,3,3)
sns.distplot(np.log1p(train_xy[train_xy.type_C == 1]['unit_sales']+1806))
plt.title('C')
plt.subplot(2,3,4)
sns.distplot(np.log1p(train_xy[train_xy.type_D == 1]['unit_sales']+1806))
plt.title('D')
plt.subplot(2,3,5)
sns.distplot(np.log1p(train_xy[train_xy.type_E == 1]['unit_sales']+1806))
plt.title('E')
plt.show()
Function to evaluate a model and calculate Error Metrics
def ErrorMetrics(train_y, train_pred, val_y, val_pred):
    """Print MSE/RMSE/MAE/MAPE for the train and validation splits.

    NOTE: MAPE divides by the actuals, which include zeros/negatives
    (returns), so it can be inf/undefined here.
    """
    for header, actual, pred in (
        ("-----Train Error------", train_y, train_pred),
        ("-----Validation Error------", val_y, val_pred),
    ):
        print(header)
        mse = mean_squared_error(actual, pred)
        print("MSE:", mse)
        print("RMSE:", np.sqrt(mse))
        print("MAE:", mean_absolute_error(actual, pred))
        print ('MAPE: ',np.mean(np.abs((actual - pred)/actual))*100)
Linear Regression Model Building
from sklearn.metrics import mean_absolute_error,mean_squared_error
def model_building(model):
    """Fit `model` on the global train split, print error metrics for the
    train and validation data, and return the fitted estimator.

    Relies on the module-level train_x/train_y/val_x/val_y frames.
    """
    def _report(header, actual, predicted):
        # Helper: print MSE/RMSE/MAE/MAPE for one data split.
        mse = mean_squared_error(actual, predicted)
        print(header)
        print('MSE: ', mse)
        print('RMSE: ', np.sqrt(mse))
        print('MAE: ', mean_absolute_error(actual, predicted))
        print('MAPE: ', np.mean(np.abs((actual - predicted)/actual))*100)
    fitted = model
    fitted.fit(train_x, train_y)
    _report('=======Train======', train_y, fitted.predict(train_x))
    _report('======Test======', val_y, fitted.predict(val_x))
    return fitted
from sklearn.linear_model import LinearRegression
reg = model_building(LinearRegression())
from mlxtend.plotting import plot_learning_curves
plot_learning_curves(train_x, train_y, val_x, val_y, reg, scoring='mean_absolute_error')
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('reg',reg)
reg=reload_backupfiles('reg')
Ridge Regularization
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
# The value of alpha determines the extent of penalization.
# But we also need to check which value of alpha gives best predictions on test data.
# For this we experiment with several values of alpha and pick the best.
# We do this by performing grid search over several values of alpha - Cross Validation
alphas = np.array([1,0.1,0.01,0.001,0.0001,0,1.5,2])
# create and fit a ridge regression model
model_ridge = Ridge()
grid = GridSearchCV(estimator=model_ridge, param_grid=dict(alpha=alphas),cv=10)
grid.fit(train_x,train_y)
print(grid)
print(grid.best_score_)
print(grid.best_estimator_.alpha)
Ridge_model= Ridge(alpha=2,normalize=False)
Ridge_model.fit(train_x,train_y)
train_pred_ridge=Ridge_model.predict(train_x)
val_pred_ridge=Ridge_model.predict(val_x)
train_pred_ridge=pd.DataFrame(train_pred_ridge)
val_pred_ridge=pd.DataFrame(val_pred_ridge)
ErrorMetrics(train_y.values,train_pred_ridge,val_y.values,val_pred_ridge)
Lasso Regularization
# create and fit a lasso regression model
model_lasso = Lasso()
grid = GridSearchCV(estimator=model_lasso, param_grid=dict(alpha=alphas),cv=5)
grid.fit(train_x,train_y)
print(grid)
print(grid.best_score_)
print(grid.best_estimator_.alpha)
Lasso_model= Lasso(alpha=0.01,normalize=False)
Lasso_model.fit(train_x,train_y)
train_pred_lasso=Lasso_model.predict(train_x)
val_pred_lasso=Lasso_model.predict(val_x)
train_pred_lasso=pd.DataFrame(train_pred_lasso)
val_pred_lasso=pd.DataFrame(val_pred_lasso)
ErrorMetrics(train_y.values,train_pred_lasso,val_y.values,val_pred_lasso)
Decision Tree
# Build DT Model
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
dt_reg1 = DecisionTreeRegressor(max_depth=15,criterion='mse')
# Fit the model on train data
%time dt_reg1.fit(train_x,train_y)
print(dt_reg1.score)
# Predict target on train and val data
train_pred = dt_reg1.predict(train_x)
val_pred = dt_reg1.predict(val_x)
# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)
Plotting feature importance
features = pd.DataFrame(dt_reg1.feature_importances_, columns = ['FI'], index = train_x.columns).sort_values('FI', ascending = True)
features.plot(kind = 'barh', figsize = (15,10))
del features
Plotting Learning Curves
from mlxtend.plotting import plot_learning_curves
plot_learning_curves(train_x, train_y, val_x, val_y, dt_reg1, scoring='mean_absolute_error')
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('dt_reg1',dt_reg1)
dt_reg1=reload_backupfiles('dt_reg1')
Random Forest
# Build Model
from sklearn.ensemble import RandomForestRegressor
rf1 = RandomForestRegressor(random_state=3,max_depth=6,min_samples_leaf=3)
# Fit model on train data
%time rf1.fit(train_x,train_y)
print(rf1.score)
# Predict target on train and val data
train_pred = rf1.predict(train_x)
val_pred = rf1.predict(val_x)
# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)
Plotting important features
features = pd.DataFrame(rf1.feature_importances_, columns = ['FI'], index = train_x.columns).sort_values('FI', ascending = True)
features.plot(kind = 'barh', figsize = (15,10))
del features
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('rf1',rf1)
rf1=reload_backupfiles('rf1')
Gradient Boosting Regressor
# Build Model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
gb= GradientBoostingRegressor(n_estimators=200,learning_rate=0.002,max_depth=6,max_leaf_nodes=15)
# Fit model on train data
%time gb.fit(train_x,train_y)
print(gb.score)
# Predict target on train and val data
train_pred = gb.predict(train_x)
val_pred = gb.predict(val_x)
# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)
XG Boost
from xgboost.sklearn import XGBRegressor
xgb1 = XGBRegressor(random_state=6,n_estimators=100,learning_rate=0.001)
# Fit model on train data
%time xgb1.fit(train_x,train_y)
print(xgb1.score)
# Predict target on train and val data
train_pred = xgb1.predict(train_x)
val_pred = xgb1.predict(val_x)
# Evaluate the model on train and val
train_pred=pd.DataFrame(train_pred)
val_pred=pd.DataFrame(val_pred)
ErrorMetrics(train_y.values,train_pred,val_y.values,val_pred)
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('xgb1',xgb1)
xgb1=reload_backupfiles('xgb1')
Pre-processing for Neural Networks
Functions to reduce memory consumption
def reduce_mem_usage(df):
    """Shrink a dataframe's memory footprint in place.

    Each integer/float column is downcast to the smallest dtype whose
    range holds its min/max; object columns become 'category'; datetime
    columns are left untouched. Prints before/after memory usage and
    returns the (mutated) dataframe.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    # Candidate dtypes, smallest first; the first one that fits wins.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            # Text columns: categorical codes are far more compact.
            df[col] = df[col].astype('category')
            continue
        if col_type == 'datetime64[ns]':
            # Datetimes are skipped (original printed a blank line here).
            print("")
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                bounds = np.iinfo(target)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                bounds = np.finfo(target)
                if c_min > bounds.min and c_max < bounds.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Nothing smaller fits: keep full double precision.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
Loading the train data
os.chdir(r"/content/drive/My Drive/PHD")
train=pd.read_csv('train.csv')
Working on last one year data
train=train[(train['date'] >= '2018-01-01') & (train['date'] <= '2018-12-31')]
Any time series data should be continuous, with no missing dates.
Since we took one year data of 2018- the expectation is to have 365 dates
print(len(train.date.unique()))
Notice that there is a missing date
Let us impute values for this date
import datetime
from datetime import date, timedelta
#add missing date
min_date = min(train.date)
max_date = max(train.date)
calendar = []
d1 = datetime.datetime.strptime(min_date, '%Y-%m-%d')
d2 = datetime.datetime.strptime(max_date, '%Y-%m-%d')
delta = d2 - d1 # timedelta
for i in range(delta.days + 1):
calendar.append(datetime.date.strftime(d1 + timedelta(days=i), '%Y-%m-%d'))
calendar = pd.DataFrame({'date':calendar})
train = calendar.merge(train, left_on='date', right_on='date', how='left')
Now check that the missing date gets imputed
print(len(train.date.unique()))
train.shape
train.head()
Modifying existing columns and generating new columns
import datetime
def process_data(data):
    """Parse the 'date' column and derive calendar features, in place.

    Adds 'day' (weekday name), 'month' (month name) and 'year' columns
    derived from the parsed dates. Mutates `data`; returns None.
    """
    parsed = pd.to_datetime(data['date'])
    data.date = parsed
    data['day'] = parsed.dt.day_name()
    data['month'] = parsed.dt.strftime('%B')
    data['year'] = parsed.dt.year
process_data(train)
train.head()
Dropping unwanted columns
train.drop(['date'],axis=1,inplace=True)
train.dtypes
Understanding train data
def understand_data(data):
    """Summarize every column of `data` in one dataframe.

    Returns a frame indexed by column name with the dtype, NA count,
    number of distinct levels, and a string rendering of the levels.
    """
    summary = {
        "Datatype": data.dtypes,
        "No of NAs": data.isna().sum(),
        "No of Levels": data.apply(lambda col: col.nunique(), axis=0),
        "Levels": data.apply(lambda col: str(col.unique()), axis=0),
    }
    return pd.DataFrame(summary)
understand_data(train)
Null Value Handling
def total_NAs(data):
    """Print the total number of missing values across the whole frame."""
    na_count = data.isna().sum().sum()
    print("Total NAs:", na_count)
total_NAs(train)
train.isna().sum()
For the date we have imputed, all other columns are missing; let us impute values for them too.
# Impute each missing unit_sales value as the mean of its nearest
# non-missing neighbours (previous and next rows).
#Check index to apply the formula
na_index_train = train[train['unit_sales'].isnull() == True].index.values
#Define the index to use to apply the formula
na_index_train_plus = na_index_train.copy()
na_index_train_minus = np.maximum(0, na_index_train-1)
# Advance the "plus" index past any run of consecutive NaN rows so it
# always points at the next non-missing row (clamped to the last row).
for i in range(len(na_index_train)):
    k = 1
    while (na_index_train[min(i+k,len(na_index_train)-1)] == na_index_train[i]+k):
        k += 1
    na_index_train_plus[i] = min(len(train)-1, na_index_train_plus[i] + k )
#Apply the formula
for i in range(len(na_index_train)):
    if (na_index_train[i] == 0):
        # First row: no previous value, copy the next one.
        train.loc[na_index_train[i], 'unit_sales'] = train.loc[na_index_train_plus[i], 'unit_sales']
    elif (na_index_train[i] == len(train)-1):
        # BUG FIX: the original compared against len(train), which a row
        # index can never equal, so a missing last row fell through to the
        # averaging branch and stayed NaN; the last valid index is
        # len(train)-1.
        train.loc[na_index_train[i], 'unit_sales'] = train.loc[na_index_train_minus[i], 'unit_sales']
    else:
        train.loc[na_index_train[i], 'unit_sales'] = (train.loc[na_index_train_plus[i], 'unit_sales'] + train.loc[na_index_train_minus[i], 'unit_sales'])/ 2
Impute values for onpromotion as False
train.onpromotion.fillna(value=False,inplace=True)
print(train.isnull().sum(), '\n')
Call the memory optimization function to reduce the file size
train = reduce_mem_usage(train)
Checking Summary statistics
train.describe(include='all')
Taking a backup and storing the backed-up files in the created folder path
def create_backupfiles(filename,dfin):
    """Pickle `dfin` to `filename` in the current working directory.

    Parameters
    ----------
    filename : str
        Output file name (written relative to os.getcwd()).
    dfin : object
        Any picklable object (dataframes and fitted models in this file).
    """
    # BUG FIX: the original open/close pair leaked the file handle if
    # pkl.dump raised; `with` guarantees the file is closed either way.
    with open(filename, 'wb') as outfile:
        pkl.dump(dfin, outfile)
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('train',train)
Verify whether the files were created or not
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
print(os.getcwd())
!ls -lrt
Reload backed-up data
def reload_backupfiles(filename):
    """Unpickle and return the object stored in `filename`.

    Parameters
    ----------
    filename : str
        Pickle file previously written by create_backupfiles, read
        relative to os.getcwd().
    """
    # BUG FIX: the original open/close pair leaked the file handle if
    # pkl.load raised; `with` guarantees the file is closed either way.
    with open(filename, 'rb') as infile:
        return pkl.load(infile)
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
train=reload_backupfiles('train')
import gc
gc.collect()
Loading and Understanding test data
os.chdir(r"/content/drive/My Drive/PHD")
test=pd.read_csv('test.csv')
total_NAs(test)
test.shape
understand_data(test)
Modifying existing features and generating new features out of the existing for test data
process_data(test)
test.dtypes
Check for continuous date values in test data
print(test.date.unique())
print(len(test.date.unique()))
test.drop('date',axis=1,inplace=True)
test.drop('id',axis=1,inplace=True)
Memory optimization for test data
test=reduce_mem_usage(test)
test.shape
understand_data(test)
Create backup for test data
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
create_backupfiles('test',test)
Reload backed-up data
os.chdir(r"/content/drive/My Drive/PHD/MemoryReducedFiles")
test=reload_backupfiles('test')
import gc
gc.collect()
Neural Networks Model Building
def convert_other_categ(data,other_categ_cols):
    """Cast the listed columns of `data` to 'category' dtype, in place.

    Each column is stringified first so mixed-type columns get uniform
    levels, then converted to pandas categorical.

    BUG FIX: the original loop body ignored the loop variable and
    re-converted *all* the listed columns on every iteration, repeating
    the same work len(other_categ_cols) times. Each column is now
    converted exactly once; the resulting dtypes are identical.
    """
    for col in other_categ_cols:
        data[col] = data[col].astype('str').astype('category')
    print(data.select_dtypes(include='category').columns, "columns are converted to categorical")
cat_cols=['day','month','year','onpromotion','item_id','locationId']
convert_other_categ(train,cat_cols)
convert_other_categ(test,cat_cols)
train.unit_sales=train.unit_sales.astype('float32')
print(train.dtypes)
print(test.dtypes)
categorical=train.select_dtypes('category').columns
categorical=categorical.drop(['locationId','item_id'])
categorical
train.columns
train_cat,val_cat,\
train_locationId,val_locationId,\
train_itemId,val_itemId,\
train_target,val_target= train_test_split(train[categorical],train['locationId'],
train['item_id'],train['unit_sales'],
test_size=0.2,random_state=1234)
print(train_cat.shape)
print("\n")
print(val_cat.shape)
print("\n")
print(train_locationId.shape)
print("\n")
print(val_locationId.shape)
print("\n")
print(train_itemId.shape)
print("\n")
print(val_itemId.shape)
print("\n")
print(train_target.shape)
print("\n")
print(val_target.shape)
print("\n")
test_cat,test_locationId,test_itemId=test[categorical],test['locationId'],test['item_id']
print(test_cat.shape)
print("\n")
print(test_locationId.shape)
print("\n")
print(test_itemId.shape)
print("\n")
Dealing with target variable
Reshape(-1,1) is used if our data has a single feature/column. It gives a new shape without changing the data.
y_train=train_target.get_values().reshape(-1, 1)
y_val=val_target.get_values().reshape(-1, 1)
Dealing with other categorical variables (which are not processed through categorical embeddings)
train_cat.head(5)
train_cat.columns
ohe=OneHotEncoder(handle_unknown='ignore',)
ohe_fit=ohe.fit(train_cat)
train_cate=ohe_fit.transform(train_cat).toarray()
train_cate
val_cate=ohe_fit.transform(val_cat).toarray()
val_cate
test_cate=ohe_fit.transform(test_cat).toarray()
test_cate
X_traindata=train_cate
X_valdata=val_cate
X_testdata=test_cate
print(X_traindata.shape)
print(X_valdata.shape)
print(X_testdata.shape)
Dealing with categ variables with multiple levels using categorical embeddings
Encoding item_id
def setdiff_sorted(array1,array2,assume_unique=False):
    """Return the elements of `array1` not present in `array2` as a list.

    With assume_unique=True, np.setdiff1d skips deduplication and may
    return elements out of order, so the result is sorted explicitly;
    otherwise np.setdiff1d already yields sorted unique values.
    """
    difference = np.setdiff1d(array1, array2, assume_unique).tolist()
    return sorted(difference) if assume_unique else difference
train_item_levels=train['item_id'].cat.categories.tolist()
test_item_levels=test['item_id'].cat.categories.tolist()
new_item_levels=setdiff_sorted(test_item_levels,train_item_levels)
total_item_levels=train_item_levels+new_item_levels
print('\nItems in train:\n')
print(len(train_item_levels))
print('\nItems in test:\n')
print(len(test_item_levels))
print('\nItems in test only:\n')
print(len(new_item_levels))
print('\nTotal items in train and test are:\n')
print(len(total_item_levels))
itemId_enc=LabelEncoder().fit(np.asarray(total_item_levels))
train_itemId_enc=itemId_enc.transform(train_itemId)
val_itemId_enc=itemId_enc.transform(val_itemId)
test_itemId_enc=itemId_enc.transform(test_itemId)
Encoding locationId
locationId_enc=LabelEncoder().fit(train['locationId'])
train_locationId_enc=locationId_enc.transform(train_locationId)
val_locationId_enc=locationId_enc.transform(val_locationId)
test_locationId_enc=locationId_enc.transform(test_locationId)
Identifying the number of nodes needed in the input layer
location_unique_id= np.size(np.unique(train['locationId']))
location_unique_id
item_unique_id= np.size(np.unique(total_item_levels))
item_unique_id
Creating Categorical Embeddings
itemId_input = Input(shape=(1, ), name="itemId")
itemId_embed = Embedding(input_dim=item_unique_id, output_dim=50)(itemId_input)
itemId_embed_flat = Flatten()(itemId_embed)
cat_inputs = Input(shape=(X_traindata.shape[1],),name='cat_inputs')
out_cat = Dense(64, activation='relu')(cat_inputs)
locationId_input = Input(shape=(1, ), name="locationId")
locationId_embed = Embedding(input_dim=location_unique_id, output_dim=50)(locationId_input)
locationId_embed_flat = Flatten()(locationId_embed)
cat_inputs = Input(shape=(X_traindata.shape[1],),name='cat_inputs')
out_cat = Dense(64, activation='relu')(cat_inputs)
NN Model 1: no regularization, dropout, or weight initialization.
Concating categorical embeddings with other numerical/categorical data and target.
concatenated = concatenate([itemId_embed_flat,locationId_embed_flat,out_cat],axis=-1)
X = Dense(8, activation='relu')(concatenated)
final_out = Dense(1, activation='linear')(X)
Create the model by passing the inputs - categorical embeddings, other features
model = Model(inputs=[itemId_input,locationId_input,cat_inputs], outputs=final_out)
Check the model summary
model.summary()
Compile the model by specifying the loss function, optimizer and metrics
model.compile(loss='mape', optimizer='adam', metrics=['mape'])
Fit the model on train predictors and target.
from keras.callbacks import ModelCheckpoint
check2=ModelCheckpoint("weights.{epoch:02d}-{val_loss:.2f}.hdf5",save_best_only=True,
monitor='val_loss',mode='min')
model.fit([train_itemId_enc,train_locationId_enc,X_traindata], y=y_train,
callbacks=[check2],epochs=50,batch_size=200000,validation_split=0.2)
Evaluate the model performance on train and validation data
model.evaluate([train_itemId_enc,train_locationId_enc,X_traindata], y=y_train)
model.evaluate([val_itemId_enc,val_locationId_enc,X_valdata], y=y_val)
Make predictions on train and validation data
pred_train=model.predict([train_itemId_enc,train_locationId_enc,X_traindata])
pred_train[1:10]
pred_val=model.predict([val_itemId_enc,val_locationId_enc,X_valdata])
pred_val[1:10]
Compute train and validation Errors
def ErrorMetrics(train_y,train_pred,val_y,val_pred):
    """Print MSE, RMSE, MAE and MAPE for the train and validation splits.

    Reporting helper only: prints to stdout, returns None.

    BUG FIX: the original labelled the mean_squared_error value "SSE"
    (sum of squared errors) although the printed quantity is the MSE;
    the label now matches the value and the earlier definition of this
    function in the file.
    """
    print("-----Train Error------")
    print("MSE:", mean_squared_error(train_y, train_pred))
    print("RMSE:", np.sqrt(mean_squared_error(train_y, train_pred)))
    print("MAE:", mean_absolute_error(train_y, train_pred))
    print('MAPE: ', np.mean(np.abs((train_y - train_pred)/train_y))*100)
    print("-----Validation Error------")
    print("MSE:", mean_squared_error(val_y, val_pred))
    print("RMSE:", np.sqrt(mean_squared_error(val_y, val_pred)))
    print("MAE:", mean_absolute_error(val_y, val_pred))
    print('MAPE: ', np.mean(np.abs((val_y - val_pred)/val_y))*100)
ErrorMetrics(y_train,pred_train,y_val,pred_val)
pred_test=model.predict([test_itemId_enc,test_locationId_enc,X_testdata])
pred_test[1:10]
NN Model 2: Adding regularization techniques and attempting
Concating categorical embeddings with other numerical/categorical data and target
concatenated = concatenate([itemId_embed_flat,locationId_embed_flat,out_cat],axis=-1)
D1 = Dropout(0.2) (concatenated)
X = Dense(8, activation='relu',kernel_initializer='normal',kernel_regularizer=regularizers.l1(0.01))(D1)
D2 = Dropout(0.4) (X)
out_inter = Dense(4, activation='relu',kernel_initializer='normal') (D2)
final_out = Dense(1, activation='linear') (out_inter)
Create the model by passing the inputs - categorical embeddings, other features
model = Model(inputs=[itemId_input,locationId_input,cat_inputs], outputs=final_out)
Check the model summary
model.summary()
Compile the model by specifying the loss function, optimizer and metrics
model.compile(loss='mape', optimizer='adam', metrics=['mse'])
Fit the model on train predictors and target.
from keras.callbacks import ModelCheckpoint
check2=ModelCheckpoint("weights.{epoch:02d}-{val_loss:.2f}.hdf5",save_best_only=True,
monitor='val_loss',mode='min')
model.fit([train_itemId_enc,train_locationId_enc,X_traindata], y=y_train,
callbacks=[check2],epochs=50,batch_size=200000,validation_split=0.2)
NN Model3: Clipping negative records with minimum value and applying log transformation
print("total no of records in train are:\n")
print(train.shape[0])
print("\nout of which records with negative sales are:\n")
print(train[train.unit_sales<0].shape[0])
print("\nPercertange of records with negative sales out of the total is:\n")
print(((train[train.unit_sales<0].shape[0])/train.shape[0])*100)
Let us check the distribution of target when there are no negative records
train_no_negsales=train[train.unit_sales>0]
train_no_negsales.shape
sns.distplot(np.log(train_no_negsales['unit_sales']),bins=10,kde=True)
At this point, since we can't determine with certainty whether unit_sales are negative due to returns or were mis-entered, instead of changing them to positive sales (which might have more impact), we shall update their unit_sales to a very small value (say 0.00001).
We don't want to drop those records all together because we don't want to have any missing dates.
train['unit_sales']=train['unit_sales'].apply(lambda x: np.log1p(x) if x>0 else 0.00001)
sns.distplot(train['unit_sales'],bins=10,kde=True)
Neural Networks Model Building
def convert_other_categ(data,other_categ_cols):
    """Cast the listed columns of `data` to 'category' dtype, in place.

    Each column is stringified first so mixed-type columns get uniform
    levels, then converted to pandas categorical.

    BUG FIX: the original loop body ignored the loop variable and
    re-converted *all* the listed columns on every iteration, repeating
    the same work len(other_categ_cols) times. Each column is now
    converted exactly once; the resulting dtypes are identical.
    """
    for col in other_categ_cols:
        data[col] = data[col].astype('str').astype('category')
    print(data.select_dtypes(include='category').columns, "columns are converted to categorical")
cat_cols=['day','month','year','onpromotion','item_id','locationId']
convert_other_categ(train,cat_cols)
convert_other_categ(test,cat_cols)
train.unit_sales=train.unit_sales.astype('float32')
print(train.dtypes)
print(test.dtypes)
categorical=train.select_dtypes('category').columns
categorical=categorical.drop(['locationId','item_id'])
categorical
train.columns
train_cat,val_cat,\
train_locationId,val_locationId,\
train_itemId,val_itemId,\
train_target,val_target= train_test_split(train[categorical],train['locationId'],
train['item_id'],train['unit_sales'],
test_size=0.2,random_state=1234)
print(train_cat.shape)
print("\n")
print(val_cat.shape)
print("\n")
print(train_locationId.shape)
print("\n")
print(val_locationId.shape)
print("\n")
print(train_itemId.shape)
print("\n")
print(val_itemId.shape)
print("\n")
print(train_target.shape)
print("\n")
print(val_target.shape)
print("\n")
test_cat,test_locationId,test_itemId=train[categorical],train['locationId'],train['item_id']
print(test_cat.shape)
print("\n")
print(test_locationId.shape)
print("\n")
print(test_itemId.shape)
print("\n")
Dealing with target variable
Reshape(-1,1) is used if our data has a single feature/column. It gives a new shape without changing the data.
y_train=train_target.get_values().reshape(-1, 1)
y_val=val_target.get_values().reshape(-1, 1)
Dealing with other categorical variables (which are not processed through categorical embeddings)
train_cat.head(5)
train_cat.columns
ohe=OneHotEncoder(handle_unknown='ignore',)
ohe_fit=ohe.fit(train_cat)
train_cate=ohe_fit.transform(train_cat).toarray()
train_cate
val_cate=ohe_fit.transform(val_cat).toarray()
val_cate
test_cate=ohe_fit.transform(test_cat).toarray()
test_cate
X_traindata=train_cate
X_valdata=val_cate
X_testdata=test_cate
print(X_traindata.shape)
print(X_valdata.shape)
print(X_testdata.shape)
Dealing with categ variables with multiple levels using categorical embeddings
Encoding item_id
def setdiff_sorted(array1,array2,assume_unique=False):
    """List the values of `array1` that do not occur in `array2`.

    np.setdiff1d returns sorted unique values when assume_unique is
    False; when True it skips deduplication and ordering, so we sort
    the result ourselves in that case.
    """
    remainder = np.setdiff1d(array1, array2, assume_unique).tolist()
    if not assume_unique:
        return remainder
    return sorted(remainder)
train_item_levels=train['item_id'].cat.categories.tolist()
test_item_levels=test['item_id'].cat.categories.tolist()
new_item_levels=setdiff_sorted(test_item_levels,train_item_levels)
total_item_levels=train_item_levels+new_item_levels
print('\nItems in train:\n')
print(len(train_item_levels))
print('\nItems in test:\n')
print(len(test_item_levels))
print('\nItems in test only:\n')
print(len(new_item_levels))
print('\nTotal items in train and test are:\n')
print(len(total_item_levels))
itemId_enc=LabelEncoder().fit(np.asarray(total_item_levels))
train_itemId_enc=itemId_enc.transform(train_itemId)
val_itemId_enc=itemId_enc.transform(val_itemId)
test_itemId_enc=itemId_enc.transform(test_itemId)
Encoding locationId
locationId_enc=LabelEncoder().fit(train['locationId'])
train_locationId_enc=locationId_enc.transform(train_locationId)
val_locationId_enc=locationId_enc.transform(val_locationId)
test_locationId_enc=locationId_enc.transform(test_locationId)
Identifying the number of nodes needed in the input layer
location_unique_id= np.size(np.unique(train['locationId']))
location_unique_id
item_unique_id= np.size(np.unique(total_item_levels))
item_unique_id
Creating Categorical Embeddings
itemId_input = Input(shape=(1, ), name="itemId")
itemId_embed = Embedding(input_dim=item_unique_id, output_dim=50)(itemId_input)
itemId_embed_flat = Flatten()(itemId_embed)
cat_inputs = Input(shape=(X_traindata.shape[1],),name='cat_inputs')
out_cat = Dense(64, activation='relu')(cat_inputs)
locationId_input = Input(shape=(1, ), name="locationId")
locationId_embed = Embedding(input_dim=location_unique_id, output_dim=50)(locationId_input)
locationId_embed_flat = Flatten()(locationId_embed)
cat_inputs = Input(shape=(X_traindata.shape[1],),name='cat_inputs')
out_cat = Dense(64, activation='relu')(cat_inputs)
Concating categorical embeddings with other numerical/categorical data and target
concatenated = concatenate([itemId_embed_flat,locationId_embed_flat,out_cat],axis=-1)
X = Dense(8, activation='relu')(concatenated)
final_out = Dense(1, activation='linear')(X)
Create the model by passing the inputs - categorical embeddings, other features
model = Model(inputs=[itemId_input,locationId_input,cat_inputs], outputs=final_out)
Check the model summary
model.summary()
Compile the model by specifying the loss function, optimizer and metrics
model.compile(loss='mape', optimizer='adam', metrics=['mape'])
Fit the model on train predictors and target.
from keras.callbacks import ModelCheckpoint
check2=ModelCheckpoint("weights.{epoch:02d}-{val_loss:.2f}.hdf5",save_best_only=True,
monitor='val_loss',mode='min')
# BUG FIX: the original passed y=y_val while fitting on the *train*
# inputs — the wrong target and a row-count mismatch with the train
# features. Train against y_train, as in the first NN model above.
model.fit([train_itemId_enc,train_locationId_enc,X_traindata], y=y_train,
          callbacks=[check2],epochs=50,batch_size=500000,validation_split=0.2)
Evaluate the model performance on train and validation data
# Evaluate against the reshaped y_train/y_val arrays — the same targets
# the model was fit on. (The original passed the un-reshaped
# train_target/val_target Series, inconsistent with the first NN model.)
model.evaluate([train_itemId_enc,train_locationId_enc,X_traindata], y=y_train)
model.evaluate([val_itemId_enc,val_locationId_enc,X_valdata], y=y_val)
Make predictions on train and validation data
pred_train=model.predict([train_itemId_enc,train_locationId_enc,X_traindata])
pred_train[1:10]
pred_val=model.predict([val_itemId_enc,val_locationId_enc,X_valdata])
pred_val[1:10]
Compute train and validation Errors
def ErrorMetrics(train_y,train_pred,val_y,val_pred):
    """Print MSE, RMSE, MAE and MAPE for the train and validation splits.

    Reporting helper only: prints to stdout, returns None.

    BUG FIX: the original labelled the mean_squared_error value "SSE"
    (sum of squared errors) although the printed quantity is the MSE;
    the label now matches the value and the first definition of this
    function in the file.
    """
    print("-----Train Error------")
    print("MSE:", mean_squared_error(train_y, train_pred))
    print("RMSE:", np.sqrt(mean_squared_error(train_y, train_pred)))
    print("MAE:", mean_absolute_error(train_y, train_pred))
    print('MAPE: ', np.mean(np.abs((train_y - train_pred)/train_y))*100)
    print("-----Validation Error------")
    print("MSE:", mean_squared_error(val_y, val_pred))
    print("RMSE:", np.sqrt(mean_squared_error(val_y, val_pred)))
    print("MAE:", mean_absolute_error(val_y, val_pred))
    print('MAPE: ', np.mean(np.abs((val_y - val_pred)/val_y))*100)
ErrorMetrics(y_train,pred_train,y_val,pred_val)
Forecasting unit sales for the test data using the Neural Networks model that got the best results
pred_test=model.predict([test_itemId_enc,test_locationId_enc,X_testdata])
pred_test[1:10]
Target=pd.DataFrame(pred_test,columns=['unit_sales'])
Target.shape
os.chdir(r"/content/drive/My Drive/PHD")
test_backup=pd.read_csv('test.csv')
id=test_backup.id
NN_Predictions_best=pd.concat([id,Target],axis=1)
create_backupfiles('NN_Predictions_best',NN_Predictions_best)
NN_Predictions_best=reload_backupfiles('NN_Predictions_best')
NN_Predictions_best
NN_Predictions_best.to_csv('NeuralNetowrk_BestPredictions.csv')
Importing the Required Libraries
import os
import numpy as np
import pandas as pd
import shutil
import pickle as pkl
import matplotlib.pyplot as plt
import seaborn as sns
# To filter/truncate warnings
import warnings
warnings.filterwarnings('ignore')
Changing the working directory and mount the drive and upload the train and test data
os.chdir(r"C:\Users\pdrva\Desktop\Divya\INSOFE\PHD")
test=pd.read_csv("test.csv")
os.chdir(r"C:\Users\pdrva\Desktop\Divya\INSOFE\PHD\MemoryReducedFiles - January\Predictions")
NN_predictions=pd.read_csv("NeuralNetwork_BestPredictions.csv")
print(test.shape)
print(NN_predictions.shape)
test.columns
NN_predictions.columns
test_output=test.merge(NN_predictions,how='outer',on=['id'])
test_output.head()
print(test.shape)
print(test_output.shape)
os.chdir(r"C:\Users\pdrva\Desktop\Divya\INSOFE\PHD")
item_details=pd.read_csv("item_details.csv")
location_details=pd.read_csv("location_details.csv")
merged_test=test_output.merge(item_details,how='left',on=['item_id'])
location_details.columns=[x.replace('location_id','locationId') for x in location_details.columns]
merged_test=merged_test.merge(location_details,how='left',on=['locationId'])
merged_test.shape
merged_test.head()
merged_test.columns
merged_test.drop(['id'],axis=1,inplace=True)
merged_test.groupby(['type','cluster']).agg({'unit_sales':'sum'}).sort_values(by='unit_sales',ascending=False)
q1=(merged_test.groupby(['type','cluster']).agg({'unit_sales':'sum'}).reset_index())
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12,7))
sns.barplot(q1.type,q1.unit_sales,hue=q1.cluster)
plt.xlabel('Business Unit')
plt.xticks(rotation='vertical')
plt.ylabel('unit_sales',)
plt.title('Business Units of which cluster will record highest sales in first 15 days of Jan 2019?')
plt.show()
(merged_test.groupby(['type','cluster']).agg({'unit_sales':'sum'}).sort_values(by='unit_sales',ascending=False)).plot(kind='bar')
q2=merged_test[merged_test.cluster==8].groupby('category_of_item').agg({'unit_sales':'sum'}).sort_values(by='unit_sales',ascending=False).head(10)
q2
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12,7))
sns.barplot(q2.index,q2.unit_sales)
plt.xlabel('Category of item')
plt.xticks(rotation='vertical')
plt.ylabel('category_of_item')
plt.title('What are the top 10 selling item categories in this cluster?')
plt.show()
q2=merged_test[merged_test.cluster==8].groupby('item_id').agg({'unit_sales':'sum'}).sort_values(by='unit_sales',ascending=False).head(10)
q2
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12,7))
sns.barplot(q2.index,q2.unit_sales)
plt.xlabel('Category of item')
plt.xticks(rotation='vertical')
plt.ylabel('unit_sales',)
plt.title('What are the top 10 selling item categories in this cluster?')
plt.show()
data=pd.DataFrame()
top_item_list=q2.index.to_list()
print(top_item_list)
for i in top_item_list:
data=data.append(pd.DataFrame(merged_test[merged_test.item_id==i]))
data.shape
data.head(5)
q3=data.groupby(['item_id']).agg({'unit_sales':'sum'})
q3.sort_values(by='unit_sales',ascending=False)
weekly_rateof_purchase=q3/2
weekly_rateof_purchase=weekly_rateof_purchase.sort_values(by='unit_sales',ascending=False)
weekly_rateof_purchase
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(12,7))
sns.barplot(weekly_rateof_purchase.index,weekly_rateof_purchase.unit_sales)
plt.xlabel('item')
plt.xticks(rotation='vertical')
plt.ylabel('unit_sales')
plt.title('Rate of purchase per week for top 10 selling items in cluster 8?')
plt.show()
Model Building using Time Series models
import itertools
p = q = range(0, 4)
d =range(0,2)
pdq = list(itertools.product(p,d,q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
print('Examples of parameter combinations for Seasonal ARIMA...')
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))
train_last_twoyears=pd.DataFrame(train[((train.year==2017) | (train.year==2018))])
train_last_twoyears.shape
train_last_twoyears.columns
train_last_twoyears=(train_last_twoyears.groupby('month_years')['unit_sales'].sum().reset_index())
train_last_twoyears=train_last_twoyears.set_index('month_years')
train_last_twoyears.head()
for param in pdq:
for param_seasonal in seasonal_pdq:
try:
mod = sm.tsa.statespace.SARIMAX(train_last_twoyears,
order=param,
seasonal_order=param_seasonal,
enforce_stationarity=False,
enforce_invertibility=False)
results = mod.fit()
print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
except:
continue
ARIMA(1, 1, 2)x(0, 1, 0, 12)12 - AIC:247.8063888631171
import statsmodels.api as sm

# Refit the monthly series with the best configuration from the grid search.
best_order = (1, 1, 2)
best_seasonal_order = (0, 1, 0, 12)
mod = sm.tsa.statespace.SARIMAX(
    train_last_twoyears,
    order=best_order,
    seasonal_order=best_seasonal_order,
    enforce_stationarity=False,
    enforce_invertibility=False,
)
results = mod.fit()
print(results.summary())
# Residual diagnostics: raw residuals over time plus their density estimate.
residuals = pd.DataFrame(results.resid)
# Pass figsize to subplots() directly: the original called
# plt.figure(figsize=(12, 12)) AFTER plt.subplots(), which opened a separate
# empty figure that the axes never used, so the size was silently ignored.
fig, ax = plt.subplots(1, 2, figsize=(12, 12))
residuals.plot(title="Residuals", ax=ax[0])
residuals.plot(kind='kde', title='Density', ax=ax[1])
plt.show()
# ==== SARIMAX model based on Week ====
# Reload the raw training data and build a weekly sales series for 2018.
os.chdir(r"/content/drive/My Drive/PHD")
train = pd.read_csv('train.csv')
train['date'] = pd.to_datetime(train['date'])
# Weekly period label for each transaction date.
train['Week'] = train.date.dt.to_period(freq='W')
train.head()

# Keep only the most recent year of data (2018 onwards).
train_last_oneyear = pd.DataFrame(train[train.date >= '2018-01-01'])
train_last_oneyear.shape
# Total unit sales per week, indexed by the week period.
weekly_totals = train_last_oneyear.groupby('Week')['unit_sales'].sum().reset_index()
train_last_oneyear = weekly_totals.set_index('Week')
train_last_oneyear.head()
import itertools

# Rebuild the SARIMA search grid for the weekly series: p, q in 0..3 and
# d in 0..1, with a 12-period seasonal cycle.
p = q = range(0, 4)
d = range(0, 2)
pdq = [(ar, diff, ma) for ar, diff, ma in itertools.product(p, d, q)]
seasonal_pdq = [(ar, diff, ma, 12) for ar, diff, ma in itertools.product(p, d, q)]
print('Examples of parameter combinations for Seasonal ARIMA...')
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))
# Grid-search SARIMA orders for the weekly series, reporting the AIC of
# every configuration that fits successfully.
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(train_last_oneyear,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
            print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
        except Exception:
            # Narrowed from the original bare 'except:', which also swallowed
            # KeyboardInterrupt and made the search impossible to stop.
            continue
# Best configuration found by the search above:
# ARIMA(3, 1, 0)x(3, 1, 0, 12)12 - AIC:43.8863383319909
import statsmodels.api as sm

# Refit the weekly series with the best configuration from the grid search.
mod = sm.tsa.statespace.SARIMAX(train_last_oneyear, order=(3, 1, 0),
                                seasonal_order=(3, 1, 0, 12),
                                enforce_stationarity=False,
                                enforce_invertibility=False)
results = mod.fit()
print(results.summary())

# Residual diagnostics. Pass figsize to subplots() directly: the original
# called plt.figure(figsize=(20, 12)) after plt.subplots(), which opened a
# separate empty figure the axes never used, so the size was ignored.
residuals = pd.DataFrame(results.resid)
fig, ax = plt.subplots(1, 2, figsize=(20, 12))
residuals.plot(title="Residuals", ax=ax[0])
residuals.plot(kind='kde', title='Density', ax=ax[1])
plt.show()
# One-step-ahead in-sample predictions from 2019-01-01 with confidence band.
pred = results.get_prediction(start=pd.to_datetime('2019-01-01'), dynamic=False)
pred_ci = pred.conf_int()
# Plot the full observed series. The original sliced ['2015':], but this
# series only begins in 2018, so that slice was a misleading no-op.
ax = train_last_oneyear.plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))
# Shade the confidence interval around the predictions.
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('Unit Sales')
plt.legend()
plt.show()
# Out-of-sample forecast for the next 4 weeks (covers 1-15 January 2019)
# with confidence band.
pred_uc = results.get_forecast(steps=4)
pred_ci = pred_uc.conf_int()
ax = train_last_oneyear.plot(label='observed', figsize=(14, 7))
pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.25)
ax.set_xlabel('Date')
# Fixed copy-pasted axis label: this series is grocery unit sales,
# not furniture sales.
ax.set_ylabel('Unit Sales')
plt.legend()
plt.show()